# Prediction on Academic Citation Networks

#### References : https://colab.research.google.com/drive/14OvFnAXggxB8vM4e8vSURUp1TaKnovzX#scrollTo=8zOh6IIeI3Op

In [1]:
import requests
import os
from torch_geometric.data import Dataset
from torch_geometric.utils import to_networkx
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
import torch
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import copy

In [2]:
class HW3Dataset(Dataset):
    url = 'https://technionmail-my.sharepoint.com/:u:/g/personal/ploznik_campus_technion_ac_il/EUHUDSoVnitIrEA6ALsAK1QBpphP5jX3OmGyZAgnbUFo0A?download=1'

    def __init__(self, root, transform=None, pre_transform=None):
        super(HW3Dataset, self).__init__(root, transform, pre_transform)

    @property
    def raw_file_names(self):
        return ['data.pt']

    @property
    def processed_file_names(self):
        return ['data.pt']

    def download(self):
        file_url = self.url.replace(' ', '%20')
        response = requests.get(file_url)

        if response.status_code != 200:
            raise Exception(f"Failed to download the file, status code: {response.status_code}")

        with open(os.path.join(self.raw_dir, self.raw_file_names[0]), 'wb') as f:
            f.write(response.content)

    def process(self):
        raw_path = os.path.join(self.raw_dir, self.raw_file_names[0])
        data = torch.load(raw_path)
        torch.save(data, self.processed_paths[0])

    def len(self):
        return 1

    def get(self, idx):
        return torch.load(self.processed_paths[0])

In [4]:
dataset = HW3Dataset(root='data/hw3_new/')
data = dataset[0]
print(data)

Data(x=[100000, 128], edge_index=[2, 444288], y=[100000, 1], node_year=[100000, 1], train_mask=[80000], val_mask=[20000])


### Network Analysis on the Graph

In [5]:
# Gather some statistics about the graph
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Number of training nodes: {data.train_mask.shape[0]}')
print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

Number of nodes: 100000
Number of edges: 444288
Average node degree: 4.44
Number of training nodes: 80000
Training node label rate: 31999.60
Has self-loops: False
Is undirected: False


In [58]:
G = to_networkx(data, to_undirected=False)

In [59]:
results = {
    "ACC": [nx.average_clustering(G)],
    "Average In Degree": [np.mean(np.array([G.in_degree(n) for n in G.nodes()]))],
    "Average Out Degree": [np.mean(np.array([G.out_degree(n) for n in G.nodes()]))],
}

results_df = pd.DataFrame.from_dict(results, orient='index')
results_df.columns = ["Our Graph"]
results_df

Unnamed: 0,Our Graph
ACC,0.121265
Average In Degree,4.44288
Average Out Degree,4.44288


### GCN with Tanh activation function and its' train_eval

In [6]:
class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        torch.manual_seed(1234)
        self.conv1 = GCNConv(dataset.num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels//2)
        self.classifier = Linear(hidden_channels//2, dataset.num_classes)

    def forward(self, x, edge_index):
        h = self.conv1(x, edge_index)
        h = h.tanh()
        h = self.conv2(h, edge_index)
        h = h.tanh()
        h = self.conv3(h, edge_index)
        h = h.tanh()
        
        out = self.classifier(h)

        return out

model = GCN(32)
print(model)

GCN(
  (conv1): GCNConv(128, 32)
  (conv2): GCNConv(32, 32)
  (conv3): GCNConv(32, 16)
  (classifier): Linear(in_features=16, out_features=40, bias=True)
)


#### Tanh Hidden channels = 16

In [54]:
np.random.seed(323)
model = GCN(16) # changing the hidden channels number to a value from [16,32,64]
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    a = data.y[data.train_mask].resize_(len(data.train_mask))
    loss = criterion(out[data.train_mask], a)
    loss.backward()
    optimizer.step()
    return loss

def test():
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)
    b = data.y[data.val_mask].resize_(len(data.val_mask))
    test_correct = pred[data.val_mask] == b
    test_acc = int(test_correct.sum()) / len(data.val_mask)
    return test_acc

total_loss = []
for i, epoch in enumerate(range(1, 501)):
    loss = train()
    total_loss.append(loss)
    if i % 10 == 0:
        print(f'Epoch: {epoch}, Loss: {loss:.4f}')
val_acc = test()
print(f'Validation Accuracy: {val_acc:.4f}')
print(f'Minimum Loss Value: {loss:.4f}')

Epoch: 1, Loss: 3.7067
Epoch: 11, Loss: 3.2249
Epoch: 21, Loss: 2.9345
Epoch: 31, Loss: 2.7440
Epoch: 41, Loss: 2.5635
Epoch: 51, Loss: 2.3945
Epoch: 61, Loss: 2.2556
Epoch: 71, Loss: 2.1358
Epoch: 81, Loss: 2.0409
Epoch: 91, Loss: 1.9668
Epoch: 101, Loss: 1.9086
Epoch: 111, Loss: 1.8635
Epoch: 121, Loss: 1.8320
Epoch: 131, Loss: 1.8019
Epoch: 141, Loss: 1.7795
Epoch: 151, Loss: 1.7658
Epoch: 161, Loss: 1.7490
Epoch: 171, Loss: 1.7354
Epoch: 181, Loss: 1.7400
Epoch: 191, Loss: 1.7161
Epoch: 201, Loss: 1.7074
Epoch: 211, Loss: 1.6989
Epoch: 221, Loss: 1.6887
Epoch: 231, Loss: 1.6806
Epoch: 241, Loss: 1.6878
Epoch: 251, Loss: 1.6682
Epoch: 261, Loss: 1.6579
Epoch: 271, Loss: 1.6508
Epoch: 281, Loss: 1.6442
Epoch: 291, Loss: 1.6381
Epoch: 301, Loss: 1.6469
Epoch: 311, Loss: 1.6294
Epoch: 321, Loss: 1.6248
Epoch: 331, Loss: 1.6202
Epoch: 341, Loss: 1.6163
Epoch: 351, Loss: 1.6206
Epoch: 361, Loss: 1.6122
Epoch: 371, Loss: 1.6073
Epoch: 381, Loss: 1.6076
Epoch: 391, Loss: 1.6070
Epoch: 401,

#### Tanh Hidden channels = 32

In [55]:
np.random.seed(323)
model = GCN(32) # changing the hidden channels number to a value from [16,32,64]
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    a = data.y[data.train_mask].resize_(len(data.train_mask))
    loss = criterion(out[data.train_mask], a)
    loss.backward()
    optimizer.step()
    return loss

def test():
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)
    b = data.y[data.val_mask].resize_(len(data.val_mask))
    test_correct = pred[data.val_mask] == b
    test_acc = int(test_correct.sum()) / len(data.val_mask)
    return test_acc

total_loss = []
for i, epoch in enumerate(range(1, 501)):
    loss = train()
    total_loss.append(loss)
    if i % 10 == 0:
        print(f'Epoch: {epoch}, Loss: {loss:.4f}')
val_acc = test()
print(f'Validation Accuracy: {val_acc:.4f}')
print(f'Minimum Loss Value: {loss:.4f}')

Epoch: 1, Loss: 3.7397
Epoch: 11, Loss: 3.0289
Epoch: 21, Loss: 2.7654
Epoch: 31, Loss: 2.4860
Epoch: 41, Loss: 2.2601
Epoch: 51, Loss: 2.0983
Epoch: 61, Loss: 1.9686
Epoch: 71, Loss: 1.8724
Epoch: 81, Loss: 1.8058
Epoch: 91, Loss: 1.7501
Epoch: 101, Loss: 1.7081
Epoch: 111, Loss: 1.6736
Epoch: 121, Loss: 1.6520
Epoch: 131, Loss: 1.6279
Epoch: 141, Loss: 1.6097
Epoch: 151, Loss: 1.5942
Epoch: 161, Loss: 1.5806
Epoch: 171, Loss: 1.5856
Epoch: 181, Loss: 1.5614
Epoch: 191, Loss: 1.5488
Epoch: 201, Loss: 1.5414
Epoch: 211, Loss: 1.5317
Epoch: 221, Loss: 1.5239
Epoch: 231, Loss: 1.5165
Epoch: 241, Loss: 1.5098
Epoch: 251, Loss: 1.5046
Epoch: 261, Loss: 1.5025
Epoch: 271, Loss: 1.4942
Epoch: 281, Loss: 1.4892
Epoch: 291, Loss: 1.4849
Epoch: 301, Loss: 1.4800
Epoch: 311, Loss: 1.4764
Epoch: 321, Loss: 1.4791
Epoch: 331, Loss: 1.4710
Epoch: 341, Loss: 1.4670
Epoch: 351, Loss: 1.4628
Epoch: 361, Loss: 1.4597
Epoch: 371, Loss: 1.4724
Epoch: 381, Loss: 1.4548
Epoch: 391, Loss: 1.4529
Epoch: 401,

#### Tanh Hidden channels = 64

In [57]:
np.random.seed(323)
model = GCN(64) # changing the hidden channels number to a value from [16,32,64]
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    a = data.y[data.train_mask].resize_(len(data.train_mask))
    loss = criterion(out[data.train_mask], a)
    loss.backward()
    optimizer.step()
    return loss

def test():
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)
    b = data.y[data.val_mask].resize_(len(data.val_mask))
    test_correct = pred[data.val_mask] == b
    test_acc = int(test_correct.sum()) / len(data.val_mask)
    return test_acc

total_loss = []
for i, epoch in enumerate(range(1, 501)):
    loss = train()
    total_loss.append(loss)
    if i % 10 == 0:
        print(f'Epoch: {epoch}, Loss: {loss:.4f}')
val_acc = test()
print(f'Validation Accuracy: {val_acc:.4f}')
print(f'Minimum Loss Value: {loss:.4f}')

Epoch: 1, Loss: 3.7103
Epoch: 11, Loss: 2.7825
Epoch: 21, Loss: 2.3772
Epoch: 31, Loss: 2.0791
Epoch: 41, Loss: 1.9099
Epoch: 51, Loss: 1.7976
Epoch: 61, Loss: 1.7026
Epoch: 71, Loss: 1.6408
Epoch: 81, Loss: 1.6079
Epoch: 91, Loss: 1.5841
Epoch: 101, Loss: 1.5586
Epoch: 111, Loss: 1.5379
Epoch: 121, Loss: 1.5273
Epoch: 131, Loss: 1.5122
Epoch: 141, Loss: 1.4935
Epoch: 151, Loss: 1.4799
Epoch: 161, Loss: 1.4754
Epoch: 171, Loss: 1.4626
Epoch: 181, Loss: 1.4564
Epoch: 191, Loss: 1.4444
Epoch: 201, Loss: 1.4366
Epoch: 211, Loss: 1.4392
Epoch: 221, Loss: 1.4275
Epoch: 231, Loss: 1.4214
Epoch: 241, Loss: 1.4169
Epoch: 251, Loss: 1.4118
Epoch: 261, Loss: 1.4149
Epoch: 271, Loss: 1.4086
Epoch: 281, Loss: 1.4018
Epoch: 291, Loss: 1.3980
Epoch: 301, Loss: 1.3955
Epoch: 311, Loss: 1.3932
Epoch: 321, Loss: 1.3907
Epoch: 331, Loss: 1.3857
Epoch: 341, Loss: 1.3824
Epoch: 351, Loss: 1.3805
Epoch: 361, Loss: 1.3854
Epoch: 371, Loss: 1.3814
Epoch: 381, Loss: 1.3759
Epoch: 391, Loss: 1.3713
Epoch: 401,

### GCN with ReLU activation function and its' train_eval

In [42]:
class GCN_relu(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        torch.manual_seed(5678)
        self.conv1 = GCNConv(dataset.num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels//2)
        self.classifier = Linear(hidden_channels//2, dataset.num_classes)

    def forward(self, x, edge_index):
        h = self.conv1(x, edge_index)
        h = h.relu()
        h = self.conv2(h, edge_index)
        h = h.relu()
        h = self.conv3(h, edge_index)
        h = h.relu()
        
        out = self.classifier(h)

        return out

#### ReLU Hidden channels = 16

In [50]:
np.random.seed(80)
model = GCN_relu(16)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    a = data.y[data.train_mask].resize_(len(data.train_mask))
    loss = criterion(out[data.train_mask], a)
    loss.backward()
    optimizer.step()
    return loss

def test():
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)
    b = data.y[data.val_mask].resize_(len(data.val_mask))
    test_correct = pred[data.val_mask] == b
    test_acc = int(test_correct.sum()) / len(data.val_mask)
    return test_acc


total_loss = []
for i, epoch in enumerate(range(1, 501)):
    loss = train()
    total_loss.append(loss)
    if i % 10 == 0:
        print(f'Epoch: {epoch}, Loss: {loss:.4f}')
val_acc = test()
print(f'Validation Accuracy: {val_acc:.4f}')
print(f'Minimum Loss Value: {loss:.4f}')

Epoch: 1, Loss: 3.6946
Epoch: 11, Loss: 3.3078
Epoch: 21, Loss: 3.0132
Epoch: 31, Loss: 2.8159
Epoch: 41, Loss: 2.5595
Epoch: 51, Loss: 2.3915
Epoch: 61, Loss: 2.2625
Epoch: 71, Loss: 2.1857
Epoch: 81, Loss: 2.1355
Epoch: 91, Loss: 2.0962
Epoch: 101, Loss: 2.0593
Epoch: 111, Loss: 2.0273
Epoch: 121, Loss: 1.9919
Epoch: 131, Loss: 1.9536
Epoch: 141, Loss: 1.9211
Epoch: 151, Loss: 1.8937
Epoch: 161, Loss: 1.8739
Epoch: 171, Loss: 1.8569
Epoch: 181, Loss: 1.8449
Epoch: 191, Loss: 1.8335
Epoch: 201, Loss: 1.8242
Epoch: 211, Loss: 1.8155
Epoch: 221, Loss: 1.8097
Epoch: 231, Loss: 1.8026
Epoch: 241, Loss: 1.7985
Epoch: 251, Loss: 1.7946
Epoch: 261, Loss: 1.7860
Epoch: 271, Loss: 1.7797
Epoch: 281, Loss: 1.7732
Epoch: 291, Loss: 1.7681
Epoch: 301, Loss: 1.7606
Epoch: 311, Loss: 1.7534
Epoch: 321, Loss: 1.7444
Epoch: 331, Loss: 1.7374
Epoch: 341, Loss: 1.7313
Epoch: 351, Loss: 1.7202
Epoch: 361, Loss: 1.7146
Epoch: 371, Loss: 1.7108
Epoch: 381, Loss: 1.7026
Epoch: 391, Loss: 1.6988
Epoch: 401,

#### ReLU Hidden channels = 32

In [51]:
np.random.seed(80)
model = GCN_relu(32)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    a = data.y[data.train_mask].resize_(len(data.train_mask))
    loss = criterion(out[data.train_mask], a)
    loss.backward()
    optimizer.step()
    return loss

def test():
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)
    b = data.y[data.val_mask].resize_(len(data.val_mask))
    test_correct = pred[data.val_mask] == b
    test_acc = int(test_correct.sum()) / len(data.val_mask)
    return test_acc


total_loss = []
for i, epoch in enumerate(range(1, 501)):
    loss = train()
    total_loss.append(loss)
    if i % 10 == 0:
        print(f'Epoch: {epoch}, Loss: {loss:.4f}')
val_acc = test()
print(f'Validation Accuracy: {val_acc:.4f}')
print(f'Minimum Loss Value: {loss:.4f}')

Epoch: 1, Loss: 3.7511
Epoch: 11, Loss: 3.1414
Epoch: 21, Loss: 2.7732
Epoch: 31, Loss: 2.5956
Epoch: 41, Loss: 2.4591
Epoch: 51, Loss: 2.2257
Epoch: 61, Loss: 2.1024
Epoch: 71, Loss: 1.9948
Epoch: 81, Loss: 1.9258
Epoch: 91, Loss: 1.8551
Epoch: 101, Loss: 1.8140
Epoch: 111, Loss: 1.7841
Epoch: 121, Loss: 1.7619
Epoch: 131, Loss: 1.7421
Epoch: 141, Loss: 1.7179
Epoch: 151, Loss: 1.7007
Epoch: 161, Loss: 1.6813
Epoch: 171, Loss: 1.6658
Epoch: 181, Loss: 1.6567
Epoch: 191, Loss: 1.6426
Epoch: 201, Loss: 1.6334
Epoch: 211, Loss: 1.6291
Epoch: 221, Loss: 1.6187
Epoch: 231, Loss: 1.6160
Epoch: 241, Loss: 1.6032
Epoch: 251, Loss: 1.5957
Epoch: 261, Loss: 1.5941
Epoch: 271, Loss: 1.5849
Epoch: 281, Loss: 1.5788
Epoch: 291, Loss: 1.5742
Epoch: 301, Loss: 1.5689
Epoch: 311, Loss: 1.5812
Epoch: 321, Loss: 1.5675
Epoch: 331, Loss: 1.5597
Epoch: 341, Loss: 1.5542
Epoch: 351, Loss: 1.5512
Epoch: 361, Loss: 1.5469
Epoch: 371, Loss: 1.5471
Epoch: 381, Loss: 1.5381
Epoch: 391, Loss: 1.5358
Epoch: 401,

#### ReLU Hidden channels = 64

In [52]:
np.random.seed(80)
model = GCN_relu(64)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    a = data.y[data.train_mask].resize_(len(data.train_mask))
    loss = criterion(out[data.train_mask], a)
    loss.backward()
    optimizer.step()
    return loss

def test():
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)
    b = data.y[data.val_mask].resize_(len(data.val_mask))
    test_correct = pred[data.val_mask] == b
    test_acc = int(test_correct.sum()) / len(data.val_mask)
    return test_acc


total_loss = []
for i, epoch in enumerate(range(1, 501)):
    loss = train()
    total_loss.append(loss)
    if i % 10 == 0:
        print(f'Epoch: {epoch}, Loss: {loss:.4f}')
val_acc = test()
print(f'Validation Accuracy: {val_acc:.4f}')
print(f'Minimum Loss Value: {loss:.4f}')

Epoch: 1, Loss: 3.6991
Epoch: 11, Loss: 2.9125
Epoch: 21, Loss: 2.4505
Epoch: 31, Loss: 2.2009
Epoch: 41, Loss: 2.0176
Epoch: 51, Loss: 1.8775
Epoch: 61, Loss: 1.8126
Epoch: 71, Loss: 1.7409
Epoch: 81, Loss: 1.7115
Epoch: 91, Loss: 1.6672
Epoch: 101, Loss: 1.6459
Epoch: 111, Loss: 1.6161
Epoch: 121, Loss: 1.5968
Epoch: 131, Loss: 1.5805
Epoch: 141, Loss: 1.5720
Epoch: 151, Loss: 1.5560
Epoch: 161, Loss: 1.5477
Epoch: 171, Loss: 1.5367
Epoch: 181, Loss: 1.5275
Epoch: 191, Loss: 1.5180
Epoch: 201, Loss: 1.5133
Epoch: 211, Loss: 1.5156
Epoch: 221, Loss: 1.5113
Epoch: 231, Loss: 1.4958
Epoch: 241, Loss: 1.4897
Epoch: 251, Loss: 1.4843
Epoch: 261, Loss: 1.4779
Epoch: 271, Loss: 1.4778
Epoch: 281, Loss: 1.4747
Epoch: 291, Loss: 1.4674
Epoch: 301, Loss: 1.4620
Epoch: 311, Loss: 1.4552
Epoch: 321, Loss: 1.4607
Epoch: 331, Loss: 1.4496
Epoch: 341, Loss: 1.4419
Epoch: 351, Loss: 1.4473
Epoch: 361, Loss: 1.4322
Epoch: 371, Loss: 1.4340
Epoch: 381, Loss: 1.4271
Epoch: 391, Loss: 1.4225
Epoch: 401,

In [46]:
best_model = model
best_model_weights = copy.deepcopy(model.state_dict())

### Saving best model

In [47]:
#torch.save(best_model, "GCN_best_model_new.pkl")

In [48]:
loaded_model = torch.load("GCN_best_model_new.pkl")
loaded_model
out_loaded = loaded_model(data.x, data.edge_index)
b = data.y[data.val_mask].resize_(len(data.val_mask))
pred_loaded = out_loaded.argmax(dim=1)
test_correct_loaded = pred_loaded[data.val_mask] == b
print(int(test_correct_loaded.sum()) / len(data.val_mask))

0.5751


### Create a dataframe for showcasing the prediction for each paper

In [49]:
from sklearn.metrics import accuracy_score
len(data.x)
idx = [i for i in range(len(data.x))]
real_full = data.y.resize_(len(data.x))
real_labels = real_full.numpy()
predicted_labels = pred.numpy()
print(accuracy_score(real_labels, predicted_labels))

results = pd.DataFrame(list(zip(idx, list(predicted_labels))),
                           columns=['idx', 'prediction'])

results
#results.to_csv('prediction.csv', index=False)

0.59665


Unnamed: 0,idx,prediction
0,0,4
1,1,9
2,2,28
3,3,2
4,4,27
...,...,...
99995,99995,26
99996,99996,5
99997,99997,18
99998,99998,30
