# Environment Installation
To load the datasets and run the code, you need to install the following packages (Python >= 3.9).
The main dependencies are:
pytorch==1.13
torch_geometric==2.2.0
torch-scatter==2.1.1+pt113cpu
torch-sparse==0.6.17+pt113cpu
torch-spline-conv==1.2.2+pt113cpu

# Dataset Download 
Three datasets are available for link prediction: Cora, PubMed and Arxiv. Please download them from the following link (**TODO:** the download link is missing here and still needs to be added):



In [1]:
from yacs.config import CfgNode as CN
import numpy as np
import torch
import random
import os, sys 
import pandas as pd 
import json
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from sklearn.preprocessing import normalize

# Single root directory that contains the `dataset/` folder read by every loader.
# Export TAPE_ROOT to point the notebook at another location instead of editing
# three separate assignments; the fallback is the original cluster path, so the
# resolved configuration is unchanged when the variable is not set.
DATA_ROOT = os.environ.get(
    'TAPE_ROOT', '/pfs/work7/workspace/scratch/cc7738-prefeature/TAPE')

cfg = CN()

cfg.dataset = CN()

# ------------------------------- Cora ----------------------------------- #
cfg.dataset.cora = CN()
cfg.dataset.cora.root = DATA_ROOT
# Prefix of the citation files; '.content' is appended by get_raw_text_cora.
cfg.dataset.cora.original = cfg.dataset.cora.root + '/dataset/cora_orig/cora'
# Tab-separated index mapping paper ids to extraction file names.
cfg.dataset.cora.papers = cfg.dataset.cora.root + '/dataset/cora_orig/mccallum/cora/papers'
# Directory prefix (trailing slash required: file names are concatenated to it).
cfg.dataset.cora.extractions = cfg.dataset.cora.root + '/dataset/cora_andrew_mccallum/extractions/'
cfg.dataset.cora.lm_model_name = 'microsoft/deberta-base'

# ------------------------------ PubMed ---------------------------------- #
cfg.dataset.pubmed = CN()
cfg.dataset.pubmed.root = DATA_ROOT
# Directory prefix holding the Pubmed-Diabetes *.tab files (trailing slash required).
cfg.dataset.pubmed.original = cfg.dataset.pubmed.root + '/dataset/PubMed_orig/data/'
# JSON file with the 'TI' (title) and 'AB' (abstract) fields.
cfg.dataset.pubmed.abs_ti = cfg.dataset.pubmed.root + '/dataset/PubMed_orig/pubmed.json'

# ------------------------------- Arxiv ---------------------------------- #
cfg.dataset.arxiv = CN()
cfg.dataset.arxiv.root = DATA_ROOT
# Tab-separated file with paper id, title and abstract columns.
cfg.dataset.arxiv.abs_ti = cfg.dataset.arxiv.root + '/dataset/ogbn_arxiv_orig/titleabs.tsv'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Display the assembled configuration tree to check the resolved dataset paths.
cfg

CfgNode({'dataset': CfgNode({'cora': CfgNode({'root': '/pfs/work7/workspace/scratch/cc7738-prefeature/TAPE', 'original': '/pfs/work7/workspace/scratch/cc7738-prefeature/TAPE/dataset/cora_orig/cora', 'papers': '/pfs/work7/workspace/scratch/cc7738-prefeature/TAPE/dataset/cora_orig/mccallum/cora/papers', 'extractions': '/pfs/work7/workspace/scratch/cc7738-prefeature/TAPE/dataset/cora_andrew_mccallum/extractions/', 'lm_model_name': 'microsoft/deberta-base'}), 'pubmed': CfgNode({'root': '/pfs/work7/workspace/scratch/cc7738-prefeature/TAPE', 'original': '/pfs/work7/workspace/scratch/cc7738-prefeature/TAPE/dataset/PubMed_orig/data/', 'abs_ti': '/pfs/work7/workspace/scratch/cc7738-prefeature/TAPE/dataset/PubMed_orig/pubmed.json'}), 'arxiv': CfgNode({'root': '/pfs/work7/workspace/scratch/cc7738-prefeature/TAPE', 'abs_ti': '/pfs/work7/workspace/scratch/cc7738-prefeature/TAPE/dataset/ogbn_arxiv_orig/titleabs.tsv'})})})

In [3]:
def seed_everything(SEED=0):
    """Seed Python's ``random``, NumPy's global RNG and PyTorch with one value.

    Args:
        SEED: Seed handed unchanged to every generator (default 0).

    Returns:
        None. Only the global generator states are modified.
    """
    # The generators are independent of each other, so the order is irrelevant.
    random.seed(SEED)  # Python random module.
    np.random.seed(SEED)  # Numpy module.
    torch.manual_seed(SEED)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(SEED)

In [4]:
def load_ab_ti(path, fn):
    """Read one extraction file and return its title and abstract lines.

    Args:
        path: Directory prefix; it is concatenated with ``fn`` as-is, so it
            must already end with a path separator.
        fn: File name appended to ``path``.

    Returns:
        Tuple ``(ti, ab)`` holding the complete lines (prefix included) whose
        text before the first ':' is exactly 'Title' / 'Abstract'. When a key
        occurs several times the last occurrence wins; a missing key yields ''.
    """
    latest = {'Title': '', 'Abstract': ''}
    with open(path + fn) as handle:
        rows = handle.read().splitlines()
    for row in rows:
        key = row.split(':')[0]
        if key in latest:
            latest[key] = row
    return latest['Title'], latest['Abstract']

In [5]:
def get_raw_text_cora(cfg, use_text=False, seed=0):
    """Load the Planetoid Cora graph and, optionally, one raw-text string per paper.

    Args:
        cfg: Configuration node; reads ``cfg.dataset.cora.root``, ``.original``,
            ``.papers`` and ``.extractions``.
        use_text: When False only the graph is loaded; the citation and text
            files are not touched and ``None`` is returned as text.
        seed: Unused by this loader; kept so the signature matches its siblings.

    Returns:
        Tuple ``(data, text)``. ``data`` is ``dataset[0]`` of the Planetoid
        'cora' dataset with ``NormalizeFeatures`` applied. ``text`` is ``None``
        or a list of ``title_line + '\\n' + abstract_line`` strings, one per row
        of ``<original>.content`` and in that file's row order.

    Raises:
        KeyError: If a paper id from the ``.content`` file is missing from the
            papers index.
    """
    dataset = Planetoid(cfg.dataset.cora.root, 'cora',
                        transform=T.NormalizeFeatures())
    data = dataset[0]
    print(data)

    # Return before any file parsing: the citation table is only needed for text.
    if not use_text:
        return data, None

    # First column of the .content table holds the paper (citation) ids.
    idx_features_labels = np.genfromtxt(
        "{}.content".format(cfg.dataset.cora.original), dtype=np.dtype(str))
    data_citeid = idx_features_labels[:, 0]

    # paper id -> extraction file name, taken from the first two tab-separated fields.
    pid_filename = {}
    with open(cfg.dataset.cora.papers) as f:
        for line in f:
            fields = line.split('\t')
            pid_filename[fields[0]] = fields[1]

    # NOTE(review): text follows the row order of the .content file; nothing here
    # establishes that this equals the node order of the Planetoid `data` object.
    # Confirm the alignment before pairing text[i] with node i.
    extractions_dir = cfg.dataset.cora.extractions
    text = []
    no_ab_or_ti = 0
    for pid in data_citeid:
        ti, ab = load_ab_ti(extractions_dir, pid_filename[pid])
        text.append(ti + '\n' + ab)
        if ti == '' or ab == '':
            no_ab_or_ti += 1
    # Every id either produced an entry or raised above, so len(text) is the found count.
    print(f"found {len(text)}/{len(data_citeid)} papers, {no_ab_or_ti} no ab or ti.")
    return data, text


In [6]:
# Load Cora together with its raw title/abstract text.
# get_raw_text_cora prints `data` itself, so the summary shows up twice below.
data, text = get_raw_text_cora(cfg, use_text=True)
print(data)

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])
found 2708/2708 papers, 321 no ab or ti.
Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])


In [7]:
# Inspect the first entry: the title line, a newline, then the abstract line.
print(text[0])

Title: The megaprior heuristic for discovering protein sequence patterns  
Abstract: Several computer algorithms for discovering patterns in groups of protein sequences are in use that are based on fitting the parameters of a statistical model to a group of related sequences. These include hidden Markov model (HMM) algorithms for multiple sequence alignment, and the MEME and Gibbs sampler algorithms for discovering motifs. These algorithms are sometimes prone to producing models that are incorrect because two or more patterns have been combined. The statistical model produced in this situation is a convex combination (weighted average) of two or more different models. This paper presents a solution to the problem of convex combinations in the form of a heuristic based on using extremely low variance Dirichlet mixture priors as part of the statistical model. This heuristic, which we call the megaprior heuristic, increases the strength (i.e., decreases the variance) of the prior in propo

In [7]:
from ogb.nodeproppred import PygNodePropPredDataset
import torch_geometric.transforms as T
import torch
import pandas as pd

In [9]:
def _parse_pubmed_nodes(path, n_nodes, n_features):
    """Parse 'Pubmed-Diabetes.NODE.paper.tab' into features, labels and an id lookup."""
    data_X = np.zeros((n_nodes, n_features), dtype='float32')
    data_Y = [None] * n_nodes
    paper_to_index = {}
    feature_to_index = {}

    with open(path + 'Pubmed-Diabetes.NODE.paper.tab', 'r') as node_file:
        # first two lines are headers
        node_file.readline()
        node_file.readline()

        for i, line in enumerate(node_file.readlines()):
            items = line.strip().split('\t')

            paper_to_index[items[0]] = i

            # label=[1,2,3]; subtract 1 to zero-count
            data_Y[i] = int(items[1].split('=')[-1]) - 1

            # f1=val1 \t f2=val2 \t ... \t fn=valn summary=...  (last field skipped)
            for feature in items[2:-1]:
                parts = feature.split('=')
                fname = parts[0]
                fvalue = float(parts[1])
                # Columns are handed out in order of first appearance; the
                # default is evaluated before insertion, i.e. the next free column.
                column = feature_to_index.setdefault(fname, len(feature_to_index))
                data_X[i, column] = fvalue

    return data_X, data_Y, paper_to_index


def _parse_pubmed_edges(path, paper_to_index):
    """Parse 'Pubmed-Diabetes.DIRECTED.cites.tab' into a unique, symmetric [2, E] index array."""
    data_edges = []

    with open(path + 'Pubmed-Diabetes.DIRECTED.cites.tab', 'r') as edge_file:
        # first two lines are headers
        edge_file.readline()
        edge_file.readline()

        for line in edge_file.readlines():
            # edge_id \t paper:tail \t | \t paper:head
            items = line.strip().split('\t')

            # Resolve both ends first so an unknown paper id raises KeyError
            # even for a self-citation.
            tail = paper_to_index[items[1].split(':')[-1]]
            head = paper_to_index[items[3].split(':')[-1]]

            # Self-loops are dropped; every other citation is stored in both directions.
            if head != tail:
                data_edges.append((head, tail))
                data_edges.append((tail, head))

    return np.unique(data_edges, axis=0).transpose()


def get_raw_text_pubmed(cfg, use_text=False, seed=0):
    """Build the PubMed graph from the Pubmed-Diabetes files and optionally load its text.

    The Planetoid 'PubMed' object is used as a container only: its ``x``,
    ``edge_index`` and ``y`` are replaced by values parsed from the
    Pubmed-Diabetes files, and a random 60/20/20 node split is attached.

    Args:
        cfg: Configuration node; reads ``cfg.dataset.pubmed.root``,
            ``.original`` (directory prefix of the .tab files) and ``.abs_ti``.
        use_text: When False the JSON text file is not read and ``None`` is
            returned as text.
        seed: Seed of the private generator that shuffles the nodes for the
            split, so the same seed always yields the same split. The global
            NumPy RNG state is left untouched.

    Returns:
        Tuple ``(data, text)``. ``data`` carries L1-normalised features,
        symmetric de-duplicated edges, zero-based labels, sorted index arrays
        ``train_id`` / ``val_id`` / ``test_id`` (NumPy) and matching boolean
        ``train_mask`` / ``val_mask`` / ``test_mask`` tensors. ``text`` is
        ``None`` or a list of ``'Title: ...\\nAbstract: ...'`` strings.

    Raises:
        KeyError: If the citation file references a paper id that is absent
            from the node file.
    """
    path = cfg.dataset.pubmed.original

    n_nodes = 19717
    n_features = 500

    data_X, data_Y, paper_to_index = _parse_pubmed_nodes(path, n_nodes, n_features)
    data_edges = _parse_pubmed_edges(path, paper_to_index)

    data_X = normalize(data_X, norm="l1")

    # load data
    dataset = Planetoid(cfg.dataset.pubmed.root, 'PubMed')
    data = dataset[0]

    # replace dataset matrices with the PubMed-Diabetes data, for which we have the original pubmed IDs
    data.x = torch.tensor(data_X)
    data.edge_index = torch.tensor(data_edges)
    data.y = torch.tensor(data_Y)

    # split data: a dedicated RandomState makes the shuffle depend on `seed` only.
    num_nodes = data.num_nodes
    node_id = np.arange(num_nodes)
    np.random.RandomState(seed).shuffle(node_id)

    train_end = int(num_nodes * 0.6)
    val_end = int(num_nodes * 0.8)
    data.train_id = np.sort(node_id[:train_end])
    data.val_id = np.sort(node_id[train_end:val_end])
    data.test_id = np.sort(node_id[val_end:])

    # Index assignment builds each mask in linear time.
    for mask_name, ids in (('train_mask', data.train_id),
                           ('val_mask', data.val_id),
                           ('test_mask', data.test_id)):
        mask = torch.zeros(num_nodes, dtype=torch.bool)
        mask[torch.as_tensor(ids, dtype=torch.long)] = True
        setattr(data, mask_name, mask)

    if not use_text:
        return data, None

    with open(cfg.dataset.pubmed.abs_ti) as f:
        pubmed = json.load(f)
    df_pubmed = pd.DataFrame.from_dict(pubmed)

    # NOTE(review): entries are emitted in the order of the JSON records; this
    # presumably matches the node order of the .tab file -- confirm.
    AB = df_pubmed['AB'].fillna("")
    TI = df_pubmed['TI'].fillna("")
    text = ['Title: ' + ti + '\n' + 'Abstract: ' + ab for ti, ab in zip(TI, AB)]
    return data, text

In [10]:
# Load PubMed with raw text, then show the graph summary and the first text entry.
data, text = get_raw_text_pubmed(cfg, use_text=True, seed=0)
print(data)
print(text[0])

Data(x=[19717, 500], edge_index=[2, 88648], y=[19717], train_mask=[19717], val_mask=[19717], test_mask=[19717], train_id=[11830], val_id=[3943], test_id=[3944])
Title: Retinal metabolic abnormalities in diabetic mouse: comparison with diabetic rat.
Abstract: PURPOSE: Dogs and rats are commonly used to examine the pathogenesis of diabetic retinopathy, but mouse is sparingly studied as an animal model of diabetic retinopathy. In this study metabolic abnormalities, postulated to contribute to the development of retinopathy in diabetes, are investigated in the retina of mice diabetic or galactose-fed for 2 months, and are compared to those obtained from hyperglycemic rats. METHODS: Diabetes was induced in mice (C57BL/6) and rats (Sprague Dawley) by alloxan injection, and experimental galactosemia by feeding normal animals diets supplemented with 30% galactose. After 2 months of hyperglycemia, levels of lipid peroxides, glutathione, nitric oxides and sorbitol, and activities of protein kina

In [11]:
# Number of text entries; compare with the node count in the summary printed above.
print(len(text))

19717


In [5]:
def get_raw_text_arxiv(cfg, use_text=False, seed=0):
    """Load ogbn-arxiv with boolean split masks and, optionally, title/abstract text.

    Args:
        cfg: Configuration node; reads ``cfg.dataset.arxiv.abs_ti`` (tab-separated
            file with paper id, title and abstract columns).
        use_text: When False neither the id mapping nor the text file is read
            and ``None`` is returned as text.
        seed: Unused by this loader; kept so the signature matches its siblings.

    Returns:
        Tuple ``(data, text)``. ``data`` is ``dataset[0]`` extended with
        ``train_mask`` / ``val_mask`` / ``test_mask`` built from the official
        index split. ``text`` is ``None`` or a list of
        ``'Title: ...\\nAbstract: ...'`` strings in the row order of the merged
        mapping table.
    """
    dataset = PygNodePropPredDataset(
        name='ogbn-arxiv')
    data = dataset[0]

    # Turn the index-based split into one boolean mask per partition.
    idx_splits = dataset.get_idx_split()
    for mask_name, split_key in (('train_mask', 'train'),
                                 ('val_mask', 'valid'),
                                 ('test_mask', 'test')):
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[idx_splits[split_key]] = True
        setattr(data, mask_name, mask)

    if not use_text:
        return data, None

    # Derive the mapping location from the dataset's own root rather than a
    # path relative to the working directory; with the default root this
    # resolves to 'dataset/ogbn_arxiv/mapping/nodeidx2paperid.csv.gz'.
    nodeidx2paperid = pd.read_csv(
        os.path.join(dataset.root, 'mapping', 'nodeidx2paperid.csv.gz'),
        compression='gzip')

    raw_text = pd.read_csv(cfg.dataset.arxiv.abs_ti,
                           sep='\t', header=None, names=['paper id', 'title', 'abs'])

    # remove string paper id
    # NOTE(review): the first row is presumably a non-numeric record, which is
    # why everything below works on rows [1:] -- confirm against the file.
    nodeidx2paperid['paper id'] = nodeidx2paperid['paper id'].astype(int)
    raw_text = raw_text.dropna()
    raw_text.loc[1:, 'paper id'] = raw_text[1:]['paper id'].astype(int)
    # NOTE(review): default (inner) merge -- nodes without a matching text row
    # are dropped, so len(text) may be smaller than data.num_nodes; verify
    # before indexing text by node id.
    df = pd.merge(nodeidx2paperid, raw_text[1:], on='paper id')
    text = ['Title: ' + ti + '\n' + 'Abstract: ' + ab
            for ti, ab in zip(df['title'], df['abs'])]
    return data, text

In [12]:
# Load ogbn-arxiv with raw text (the dataset is downloaded on first use, see
# the log below), print the graph summary and show the container type of `text`.
data, text = get_raw_text_arxiv(cfg, use_text=True)
print(data)
type(text)

Downloading http://snap.stanford.edu/ogb/data/nodeproppred/arxiv.zip


Downloaded 0.08 GB: 100%|██████████| 81/81 [00:12<00:00,  6.71it/s]


Extracting dataset/arxiv.zip


Processing...


Loading necessary files...
This might take a while.
Processing graphs...


100%|██████████| 1/1 [00:00<00:00, 10058.28it/s]


Converting graphs into PyG objects...


100%|██████████| 1/1 [00:00<00:00, 72.83it/s]

Saving...



Done!


Data(num_nodes=169343, edge_index=[2, 1166243], x=[169343, 128], node_year=[169343, 1], y=[169343, 1], train_mask=[169343], val_mask=[169343], test_mask=[169343])


list

In [14]:
# len() of the Data object, not the node count: the result (8) equals the
# number of attributes listed in the summary printed by the previous cell.
len(data)

8