# DeepWalk and Text Baseline Models

In [1]:
# !pip install dgl dglgo -f https://data.dgl.ai/wheels/repo.html
# !pip install dgl-cu101
!pip install dgl-cu101 dglgo -f https://data.dgl.ai/wheels/repo.html

Looking in links: https://data.dgl.ai/wheels/repo.html
Collecting dgl-cu101
  Downloading https://data.dgl.ai/wheels/dgl_cu101-0.8.1-cp37-cp37m-manylinux1_x86_64.whl (150.0 MB)
[K     |████████████████████████████████| 150.0 MB 8.6 kB/s 
[?25hCollecting dglgo
  Downloading dglgo-0.0.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 2.4 MB/s 
Collecting PyYAML>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 9.8 MB/s 
[?25hCollecting autopep8>=1.6.0
  Downloading autopep8-1.6.0-py2.py3-none-any.whl (45 kB)
[K     |████████████████████████████████| 45 kB 2.8 MB/s 
[?25hCollecting ruamel.yaml>=0.17.20
  Downloading ruamel.yaml-0.17.21-py3-none-any.whl (109 kB)
[K     |████████████████████████████████| 109 kB 48.4 MB/s 
[?25hCollecting numpydoc>=1.1.0
  Downloading numpydoc-1.2.1-py3-none-any.whl (51 kB)
[K     |█████████

In [4]:
import torch 
import networkx as nx
import numpy as np
import scipy.sparse as sp
import dgl
import random

from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, roc_auc_score
#Seeds
dgl.seed(42)
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7fe6735aa430>

In [5]:
PATH_TO_GRAPH_FILE = "cit-HepTh.txt"
graphAdjList = []
with open(PATH_TO_GRAPH_FILE, 'r') as f:
    L = f.readlines()
    for line_ in L:
        if "#" in line_:
            continue 
        src, dst = map(lambda x: int(x), line_.strip().split('\t'))
        graphAdjList.append([src,dst])

### Loading Data into NetworkX

In [6]:
nx_g = nx.DiGraph()
nx_g.add_edges_from(graphAdjList)

paper_to_node = {node:index for index, node in enumerate(sorted(nx_g.nodes())) }
node_to_paper = {v:k for k, v in paper_to_node.items()}

In [7]:
print(torch.cuda.is_available())
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

True
Using device: cuda


In [8]:
g = dgl.from_networkx(nx_g).to(device)

In [9]:
# Split edge set for training and testing
u, v = g.edges()

eids = np.arange(g.number_of_edges())
eids = np.random.permutation(eids)
test_size = int(len(eids) * 0.1)
train_size = g.number_of_edges() - test_size
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

print("train_size: {}, test_size: {}".format(train_size, test_size))

train_size: 317527, test_size: 35280


In [10]:
# Find all negative edges and split them for training and testing
neg_u, neg_v = dgl.sampling.global_uniform_negative_sampling(g, g.number_of_edges())
test_neg_u, test_neg_v = neg_u[:test_size], neg_v[:test_size]
train_neg_u, train_neg_v = neg_u[test_size:], neg_v[test_size:]

In [11]:
# Removing test edges from graph
train_g = dgl.remove_edges(g, eids[:test_size])
nx_train_g = dgl.to_networkx(train_g.cpu())

In [12]:
print("Number of nodes: {}, Number of edges: {}".format(nx_train_g.number_of_nodes(), nx_train_g.number_of_edges()))

Number of nodes: 27770, Number of edges: 317527


### Train Deepwalk

In [13]:
!pip install install karateclub

Collecting install
  Downloading install-1.3.5-py3-none-any.whl (3.2 kB)
Collecting karateclub
  Downloading karateclub-1.2.3.tar.gz (62 kB)
[K     |████████████████████████████████| 62 kB 534 kB/s 
Collecting pygsp
  Downloading PyGSP-0.5.1-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 11.1 MB/s 
[?25hCollecting gensim>=4.0.0
  Downloading gensim-4.1.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 1.4 MB/s 
Collecting python-Levenshtein
  Downloading python-Levenshtein-0.12.2.tar.gz (50 kB)
[K     |████████████████████████████████| 50 kB 6.2 MB/s 
Building wheels for collected packages: karateclub, python-Levenshtein
  Building wheel for karateclub (setup.py) ... [?25l[?25hdone
  Created wheel for karateclub: filename=karateclub-1.2.3-py3-none-any.whl size=97754 sha256=bedc4e0e093f1128731ae9b3321c211d65cfbc2977621dd124bef019d4f833a8
  Stored in directory: /root/.cache/pip/wh

In [14]:
from karateclub import DeepWalk, Node2Vec

In [15]:
# node2id = {node:i for i, node in enumerate(sorted(nx_g.nodes()))}
node2id = {node:i for i, node in enumerate(sorted(nx_g.nodes()))}
id2node = {v:k for k, v in paper_to_node.items()}

In [16]:
model = DeepWalk()
# model = Node2Vec()
model.fit(nx_train_g)
node_embedding = model.get_embedding()

In [17]:
print("Node Embedding Shape: {}".format(node_embedding.shape))

Node Embedding Shape: (27770, 128)


In [18]:
np.savetxt('deepwalk_embeddings.txt', node_embedding)

### Get Text Embeddings

In [19]:
!pip install --upgrade --no-cache-dir gdown &> /dev/null
!gdown 1OxLLkPAeaC10Q3Ko-WN74ov_4zhmoPff
# https://drive.google.com/file/d/1OxLLkPAeaC10Q3Ko-WN74ov_4zhmoPff/view?usp=sharing

Downloading...
From: https://drive.google.com/uc?id=1OxLLkPAeaC10Q3Ko-WN74ov_4zhmoPff
To: /content/sentence_transformers_embeddings.pkl
100% 47.6M/47.6M [00:00<00:00, 207MB/s]


In [20]:
import pickle
TEXT_EMBEDDING_FILE = "sentence_transformers_embeddings.pkl"

In [21]:
with open(TEXT_EMBEDDING_FILE, 'rb') as f:
    text_embeddings = pickle.load(f)
text_embeddings = {int(k) : v for k, v in text_embeddings.items()} #Convert keys to integer values

### PyTorch Dataset

In [22]:
class CitationDataset(Dataset):
    def __init__(self, graph, edges_u, edges_v, labels, node_embedding, text_embedding, node2id, id2node):
        self.graph = graph
        self.edges_u = edges_u
        self.edges_v = edges_v
        self.labels = labels
        self.node2id = node2id
        self.id2node = id2node
        self.node_embedding = node_embedding
        self.text_embedding = text_embedding
    
    def __len__(self):
        return len(self.edges_u)
    
    def __getitem__(self, idx):
        u = self.edges_u[idx]
        v = self.edges_v[idx]
        node_u_emb = torch.tensor(self.node_embedding[u])
        node_v_emb = torch.tensor(self.node_embedding[v])
        
        text_u_emb = torch.tensor(self.text_embedding[id2node[u.item()]])
        text_v_emb = torch.tensor(self.text_embedding[id2node[v.item()]])
        
        label = self.labels[idx]
        return node_u_emb, node_v_emb, text_u_emb, text_v_emb, label

In [23]:
train_u = torch.concat((train_pos_u, train_neg_u), dim=0)
train_v = torch.concat((train_pos_v, train_neg_v), dim=0)
train_label = torch.cat([torch.ones(train_pos_u.shape[0]), torch.zeros(train_neg_u.shape[0])])
train_label = train_label.long()
train_u.shape, train_v.shape, train_label.shape, train_label.type()

(torch.Size([635054]),
 torch.Size([635054]),
 torch.Size([635054]),
 'torch.LongTensor')

In [24]:
test_u = torch.concat((test_pos_u, test_neg_u), dim=0)
test_v = torch.concat((test_pos_v, test_neg_v), dim=0)
test_label = torch.cat([torch.ones(test_pos_u.shape[0]), torch.zeros(test_pos_v.shape[0])])
test_label = test_label.long()
test_u.shape, test_v.shape, test_label.shape, test_label.type()

(torch.Size([70560]),
 torch.Size([70560]),
 torch.Size([70560]),
 'torch.LongTensor')

In [25]:
train_dataset = CitationDataset(nx_train_g, train_u, train_v, train_label, node_embedding, text_embeddings, node2id, id2node)
test_dataset = CitationDataset(nx_g, test_u, test_v, test_label, node_embedding, text_embeddings, node2id, id2node)

In [26]:
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)

##### Dataset Test Loop

In [27]:
for i, data in enumerate(train_loader):
    node_u, node_v, text_u, text_v, label = data
    print("node_u: {}, node_v: {}".format(node_u.size(), node_v.size()))
    print("text_u: {}, text_v: {}".format(text_u.size(), text_v.size()))
    print("label.size(): {}".format(label.size()))
    break

node_u: torch.Size([256, 128]), node_v: torch.Size([256, 128])
text_u: torch.Size([256, 384]), text_v: torch.Size([256, 384])
label.size(): torch.Size([256])


## Model Classes

### DeepWalk Embeddings

In [28]:
class DeepWalkBaseline(nn.Module):
    def __init__(self, node_emb_dim=128, hidden_dim=128):
        super().__init__()
        self.node_emb_dim = node_emb_dim
        self.linear1 = nn.Linear(2 * self.node_emb_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, 2)
        self.activation = nn.ReLU()
    
    def forward(self, node_u, text_u, node_v, text_v):
        node_combined = torch.cat((node_u, node_v), dim=1) # N x (2 * node_dim)
        _out = self.activation(self.linear1(node_combined)) 
        _out = self.linear2(_out) # N x 2
        return _out

### Text Embeddings

In [29]:
class TextBaseline(nn.Module):
    def __init__(self, text_emb_dim=384, hidden_dim=128):
        super().__init__()
        self.text_emb_dim = text_emb_dim
        self.linear1 = nn.Linear(2 * self.text_emb_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, 2)
        self.activation = nn.ReLU()
    
    def forward(self, node_u, text_u, node_v, text_v):
        node_combined = torch.cat((text_u, text_v), dim=1) # N x (2 * text_dim)
        _out = self.activation(self.linear1(node_combined)) 
        _out = self.linear2(_out) # N x 2
        return _out

### Combined Embeddings

In [30]:
# Node & Text Embedding Combined Model
class NTEC(nn.Module):
    def __init__(self, node_emb_dim=128, text_emb_dim=384, hidden_dim=128):
        super().__init__()
        self.node_emb_dim = node_emb_dim        
        self.text_emb_dim = text_emb_dim
        self.linear1 = nn.Linear(2*(self.text_emb_dim + self.node_emb_dim), 128)
        self.linear2 = nn.Linear(hidden_dim, 2)
        self.activation = nn.ReLU()
    
    def forward(self, node_u, text_u, node_v, text_v):
        node_combined = torch.cat((node_u, text_u, node_v, text_v), dim=1) # N x (2 * (node_dim + text_dim))
        _out = self.activation(self.linear1(node_combined)) 
        _out = self.linear2(_out) # N x 2
        return _out    

### Metric Definition and Training Loop

In [31]:
def compute_metrics(preds, labels):
    preds = np.array(preds)
    labels = np.array(labels)
    # roc_auc = roc_auc_score(labels, preds[:, 1])
    roc_auc = 1
    acc = accuracy_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)
    return roc_auc, acc, precision, recall, f1

In [32]:
def train(model, train_loader, test_loader, num_epochs):
    for epoch in range(num_epochs):
        model.train()
        pred_labels_epoch = []
        true_labels_epoch = []
        train_loss_epoch = 0.0
        outs = []
        # outs = np.array(outs)
        for i, data in enumerate(train_loader):
            # if i >= 5:
            #     break
          
            node_u, node_v, text_u, text_v, label = data
            node_u = node_u.to(device)
            node_v = node_v.to(device)
            text_u = text_u.to(device)
            text_v = text_v.to(device)
            label = label.to(device)
            
            out = model(node_u, text_u, node_v, text_v)
            outs.extend(out[:, 1].detach().cpu().numpy())
            preds = torch.argmax(out, dim=1)
            loss = criterion(out, label)
            train_loss_epoch += loss.item()
            
            pred_labels_epoch.extend(list(preds.detach().cpu().numpy()))
            true_labels_epoch.extend(list(label.detach().cpu().numpy()))
            
            # backward pass
            loss.backward()
            optimizer.step()
                    
        # Compute train metrics
        roc_auc, acc, precision, recall, f1 = compute_metrics(pred_labels_epoch, true_labels_epoch)
        outs = np.array(outs)
        roc_auc = roc_auc_score(true_labels_epoch, outs)
        print("TRAIN, Epoch number: ", epoch)
        print("Num labels: {}".format(len(pred_labels_epoch)))
        print("Loss: {}".format(train_loss_epoch / len(pred_labels_epoch)))
        print("roc_auc: {}, acc: {}, precision: {}, recall: {}, f1: {}".format(roc_auc, acc, precision, recall, f1))
        evaluate(model, test_loader)
        print("-"*100)

### Evaluation Function

In [33]:
def evaluate(model, dataloader):
    model.eval()
    with torch.no_grad():
        pred_labels_epoch = []
        true_labels_epoch = []
        outs = []        
        for i, data in enumerate(dataloader):
            node_u, node_v, text_u, text_v, label = data
            
            node_u, node_v, text_u, text_v, label = data
            node_u = node_u.to(device)
            node_v = node_v.to(device)
            text_u = text_u.to(device)
            text_v = text_v.to(device)
            label = label.to(device)

            out = model(node_u, text_u, node_v, text_v)
            outs.extend(out[:, 1].detach().cpu().numpy())
            preds = torch.argmax(out, dim=1)

            pred_labels_epoch.extend(list(preds.detach().cpu().numpy()))
            true_labels_epoch.extend(list(label.detach().cpu().numpy()))
        
        roc_auc, acc, precision, recall, f1 = compute_metrics(pred_labels_epoch, true_labels_epoch)
        outs = np.array(outs)
        roc_auc = roc_auc_score(true_labels_epoch, outs)
        print("TEST")
        print("roc_auc: {}, acc: {}, precision: {}, recall: {}, f1: {}".format(roc_auc, acc, precision, recall, f1))

## Model Training

In [34]:
model = DeepWalkBaseline()
# model = TextBaseline()
# model = NTEC()
model = model.to(device)

In [35]:
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5, weight_decay=0.01)
criterion = nn.CrossEntropyLoss()

In [None]:
train(model, train_loader, test_loader, num_epochs=30)

TRAIN, Epoch number:  0
Num labels: 635054
Loss: 0.0021814378612129556
roc_auc: 0.7778319580734262, acc: 0.7132259618867057, precision: 0.7237530900101788, recall: 0.6897019780995002, f1: 0.7063173766156277
TEST
roc_auc: 0.7790869452703605, acc: 0.696797052154195, precision: 0.6921884515307535, recall: 0.7087868480725623, f1: 0.7003893230260761
----------------------------------------------------------------------------------------------------
TRAIN, Epoch number:  1
Num labels: 635054
Loss: 0.0019137103459948485
roc_auc: 0.838812592627987, acc: 0.7537563734737518, precision: 0.7584412918456175, recall: 0.7446925773241331, f1: 0.7515040569014998
TEST
roc_auc: 0.7920679614910968, acc: 0.7231434240362812, precision: 0.7238095238095238, recall: 0.721655328798186, f1: 0.7227308210914769
----------------------------------------------------------------------------------------------------
TRAIN, Epoch number:  2
Num labels: 635054
Loss: 0.0018002767590903353
roc_auc: 0.8612507700236458, acc: 

## Model Evaluation

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve, auc, RocCurveDisplay

def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    fpr, tpr, thresholds = roc_curve(labels, scores)
    roc_auc = auc(fpr, tpr)
    display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name='Link Prediction')
    display.plot()
    return roc_auc