In [None]:
# Install required packages.
import os
import torch
import pickle

!pip install torch_geometric

In [2]:
#functions for saving and loading pickle files
def dump_pickle_file(filename,file):
  """Pickle `file` into /content/drive/MyDrive/thesis/data/<filename>.pkl.

  Args:
    filename: Base name of the target file (no directory, no ".pkl" suffix).
    file: The Python object to serialise.
  """
  target_path = "/content/drive/MyDrive/thesis/data/" + filename + ".pkl"
  with open(target_path, "wb") as handle:
    pickle.dump(file, handle)

def load_pickle_file(filename):
  """Load and return the object pickled at /content/drive/MyDrive/thesis/data/<filename>.pkl.

  Args:
    filename: Base name of the file (no directory, no ".pkl" suffix).

  Returns:
    The unpickled Python object.
  """
  source_path = "/content/drive/MyDrive/thesis/data/" + filename + ".pkl"
  # The context manager closes the handle even if unpickling raises; the
  # previous version opened the file and never closed it.
  # NOTE: pickle.load can execute arbitrary code, so only load files you created yourself.
  with open(source_path, "rb") as file_to_read:
    return pickle.load(file_to_read)

In [None]:
#Load dataset

from torch_geometric.data import Data
from torch_geometric.datasets import AttributedGraphDataset
import torch_geometric.transforms as T
import torch.nn.functional as F

# Instantiate the attributed-graph dataset under /tmp and keep its first graph.
name_data = 'BlogCatalog'
dataset_root = '/tmp/' + name_data
dataset = AttributedGraphDataset(root=dataset_root, name=name_data)
data = dataset[0]

# dataset.transform = T.NormalizeFeatures()
# Quick summary of the label space and the feature dimensionality.
print(f"Number of Classes in {name_data}:", dataset.num_classes)
print(f"Number of Node Features in {name_data}:", dataset.num_node_features)

# **Build content graph**

In [None]:
import numpy as np
from numpy.linalg import norm

def cosine_sim(A,B):
  """Return the cosine similarity of two 1-D vectors.

  Args:
    A: First vector (array-like).
    B: Second vector (array-like) of the same length as `A`.

  Returns:
    dot(A, B) / (||A|| * ||B||), or 0.0 when either vector has zero norm.
  """
  denominator = norm(A) * norm(B)
  if denominator == 0:
    # A zero vector has no direction. Without this guard the division yields
    # nan and emits a RuntimeWarning.
    return 0.0
  return np.dot(A, B) / denominator

# Node feature matrix as a NumPy array, detached from autograd.
features = data.x.detach().numpy()

In [None]:
def make_graph():
  """Compute pairwise cosine similarities between the rows of the global `features`.

  Only the strictly lower triangle (column < row) is filled; the diagonal and
  the upper triangle stay 0. The row index is printed after each row as a
  progress indicator.

  Returns:
    The (n, n) NumPy similarity matrix, where n = len(features). Earlier the
    matrix was built in a local variable and discarded when the function
    returned; assign the result to `sims` before calling `complete_graph()`.
  """
  n = len(features)
  sims = np.zeros((n, n))
  for i in range(n):
    # Visit only j < i instead of scanning every j and skipping most of them.
    for j in range(i):
      sims[i,j] = cosine_sim(features[i], features[j])
    print(i)
  return sims

In [None]:
def complete_graph():
  """Mirror the lower triangle of the global `sims` onto its upper triangle, in place.

  The diagonal is left untouched. The range is taken from len(features).
  """
  size = len(features)
  for row in range(size):
    # Columns to the right of the diagonal receive the transposed entry.
    for col in range(row + 1, size):
      sims[row,col]=sims[col,row]

In [None]:
# dump_pickle_file("cosine_similarity_BlogCatalog",sims)
sims= load_pickle_file("cosine_similarity_BlogCatalog")

# Content-graph edges: one [i, j] pair for every off-diagonal entry whose
# similarity exceeds 0.2. A boolean mask replaces the element-by-element
# Python double loop. np.nonzero reports hits in row-major order, so the
# edges come out in the same order as the nested loops produced them.
src, dst = np.nonzero(np.asarray(sims) > 0.2)
off_diagonal = src != dst
edge_list2 = np.column_stack((src[off_diagonal], dst[off_diagonal])).tolist()

In [None]:
# Group node ids by label: dic_list[c] collects the ids of the nodes labelled c.
# Indexing dic_list by label assumes the labels are 0 .. C-1 — TODO confirm for other datasets.
y = data.y.detach().numpy()
keys = list(range(len(y)))
y_dictionary = dict(zip(keys, y))
score_list = set(y)
dic_list = [[] for _ in score_list]

for node_id, label in y_dictionary.items():
  dic_list[label].append(node_id)

In [None]:
import random

# NOTE: the split draws from Python's global `random` state and this cell sets
# no seed, so each fresh run yields a different split unless `random` is
# seeded beforehand.

# Training set: 20 nodes sampled from every per-label list in dic_list.
l = []
for sample_list in dic_list:
  l.extend(random.sample(sample_list, 20))

num_nodes = len(data.x)
train_ids = set(l)
train_mask = [idx in train_ids for idx in range(num_nodes)]

# Validation set: 500 nodes drawn from everything not used for training.
mylist = [elt for elt in range(num_nodes) if elt not in train_ids]
l1 = random.sample(mylist, 500)
val_ids = set(l1)
val_mask = [idx in val_ids for idx in range(num_nodes)]

# Test set: 1000 nodes drawn from what remains after training and validation.
mylist = [elt for elt in mylist if elt not in val_ids]
l2 = random.sample(mylist, 1000)
test_ids = set(l2)
test_mask = [idx in test_ids for idx in range(num_nodes)]

# Attach the masks to the graph object as boolean tensors.
data.train_mask = torch.tensor(train_mask, dtype=torch.bool)
data.test_mask = torch.tensor(test_mask, dtype=torch.bool)
data.val_mask = torch.tensor(val_mask, dtype=torch.bool)

# **GCN**

In [None]:
!pip install torcheval

import argparse
import os.path as osp
import torch
from torch.nn import Linear
import torch.nn.functional as F
import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid
from torch_geometric.logging import init_wandb, log
from torch_geometric.nn import GATv2Conv, GCNConv,GATConv
from torch.nn import Linear, Parameter
from torcheval.metrics.functional import multiclass_f1_score

In [None]:
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network returning raw class scores.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Output width of the first convolution.
        out_channels: Output width of the second convolution.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Apply dropout -> conv1 -> ReLU -> dropout -> conv2 and return the result."""
        drop_p = 0.5  # dropout is only active while self.training is True
        hidden = F.dropout(x, p=drop_p, training=self.training)
        hidden = F.relu(self.conv1(hidden, edge_index))
        hidden = F.dropout(hidden, p=drop_p, training=self.training)
        return self.conv2(hidden, edge_index)

In [None]:
# Settings for the plain GCN runs: epochs per run and hidden layer width.
epochs, hidden_channels = 200, 64
# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def train():
    """Run one optimisation step on the global `model` and return the loss.

    The forward pass covers the whole graph; the cross-entropy loss is taken
    over the nodes selected by `data.train_mask` only.

    Returns:
        The training loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()
    logits = model(data.x, data.edge_index)
    labelled = data.train_mask
    objective = F.cross_entropy(logits[labelled], data.y[labelled])
    objective.backward()
    optimizer.step()
    return float(objective)


@torch.no_grad()
def test():
    """Evaluate the global `model` on the train, validation and test masks.

    Returns:
        A list [train_acc, val_acc, test_acc, test_macro_f1]. The three
        accuracies are Python floats; the F1 entry is whatever
        `multiclass_f1_score` returns for the test mask.
    """
    model.eval()
    pred = model(data.x, data.edge_index).argmax(dim=-1)

    def _accuracy(mask):
        # Fraction of correctly predicted nodes among those selected by `mask`.
        return int((pred[mask] == data.y[mask]).sum()) / int(mask.sum())

    accs = [_accuracy(mask) for mask in (data.train_mask, data.val_mask, data.test_mask)]
    accs.append(
        multiclass_f1_score(pred[data.test_mask],  data.y[data.test_mask], num_classes=dataset.num_classes,average="macro")
    )
    return accs


# Ten independent training runs of the plain GCN. Each run keeps the test
# accuracy and macro-F1 measured at the epoch with the best validation accuracy.
accuracies=[]
F1_scores=[]
for k in range(10):

  best_val_acc = final_test_acc = 0
  # Give the reported metrics a defined value at the start of every run.
  # They used to be bound only inside the `if` below, so a run whose
  # validation accuracy never rose above 0 raised NameError (first run) or
  # silently reported the previous run's numbers.
  test_acc = F1_score = 0
  model = GCN(dataset.num_features, hidden_channels, dataset.num_classes)
  model, data = model.to(device), data.to(device)
  optimizer = torch.optim.Adam([
      dict(params=model.conv1.parameters(), weight_decay=5e-4),
      dict(params=model.conv2.parameters(), weight_decay=5e-4)
  ], lr=0.01)


  # Re-initialise every child layer that exposes reset_parameters().
  for layer in model.children():
    if hasattr(layer, 'reset_parameters'):
        layer.reset_parameters()

  for epoch in range(1, epochs + 1):
      loss = train()
      train_acc, val_acc, tmp_test_acc,tmp_F1_score = test()
      # Model selection: remember the test metrics of the best-validation epoch.
      if val_acc > best_val_acc:
          best_val_acc = val_acc
          test_acc = tmp_test_acc
          F1_score = tmp_F1_score
      log(Epoch=epoch, k=k,Loss=loss, Train=train_acc, Val=val_acc, Test=test_acc,F1_Score= F1_score)
  accuracies.append(test_acc)
  print(test_acc)
  print(F1_score)
  F1_scores.append(F1_score)
  print("*"*40)

# **AugSS-GCN**

In [None]:
class GCN(torch.nn.Module):
    """GCN variant whose first layer is applied to two edge sets and blended.

    NOTE: this class reuses the name `GCN`, so running this cell replaces the
    plain two-layer `GCN` defined earlier in the notebook namespace.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Output width of the first convolution.
        out_channels: Output width of the second convolution.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)
        # Belongs to a concat-and-project fusion that forward() does not use.
        self.lin = Linear(hidden_channels * 2, hidden_channels, bias=False)
        # Learnable scalar blend weights, initialised to 1 on the module-level `device`.
        self.w1 = torch.nn.Parameter(torch.ones(1).to(device), requires_grad=True)
        self.w2 = torch.nn.Parameter(torch.ones(1).to(device), requires_grad=True)

    def forward(self, x, edge_index,edge_index2):
        """Blend conv1 over both edge sets, then apply conv2 over `edge_index`."""
        x = F.dropout(x, p=0.5, training=self.training)
        # The same first-layer weights are used for both edge sets.
        branch_a = F.relu(self.conv1(x, edge_index))
        branch_b = F.relu(self.conv1(x, edge_index2))
        # Weighted sum of the two views. Plain addition and concat + self.lin
        # were the alternatives tried earlier.
        fused = self.w1 * branch_a + self.w2 * branch_b
        fused = F.dropout(fused, p=0.5, training=self.training)
        return self.conv2(fused, edge_index)

In [None]:
# Settings for the AugSS-GCN runs.
epochs, hidden_channels = 200, 64
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Turn the list of [src, dst] pairs into a contiguous long tensor with one
# column per edge, placed on the training device.
edge_index2 = torch.tensor(edge_list2, dtype=torch.long).t().contiguous().to(device)

def train():
    """Run one optimisation step of the two-graph model and return the loss.

    The global `model` receives both `data.edge_index` and the global
    `edge_index2`; the cross-entropy loss uses the `data.train_mask` nodes only.

    Returns:
        The training loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()
    scores = model(data.x, data.edge_index, edge_index2)
    selected = data.train_mask
    step_loss = F.cross_entropy(scores[selected], data.y[selected])
    step_loss.backward()
    optimizer.step()
    return float(step_loss)


@torch.no_grad()
def test():
    """Evaluate the two-graph model on the train, validation and test masks.

    Returns:
        A list [train_acc, val_acc, test_acc, test_macro_f1]. The accuracies
        are Python floats; the last entry is the value returned by
        `multiclass_f1_score` for the test mask.
    """
    model.eval()
    pred = model(data.x, data.edge_index, edge_index2).argmax(dim=-1)
    truth = data.y

    accs = []
    for mask in (data.train_mask, data.val_mask, data.test_mask):
        hits = int((pred[mask] == truth[mask]).sum())
        accs.append(hits / int(mask.sum()))
    accs.append(
        multiclass_f1_score(pred[data.test_mask],  data.y[data.test_mask], num_classes=dataset.num_classes,average="macro")
    )
    return accs

# Ten independent training runs of the two-graph GCN. Each run keeps the test
# accuracy and macro-F1 measured at the epoch with the best validation accuracy.
accuracies=[]
F1_scores=[]

for k in range(10):

  best_val_acc = final_test_acc = 0
  # Give the reported metrics a defined value at the start of every run.
  # They used to be bound only inside the `if` below, so a run whose
  # validation accuracy never rose above 0 raised NameError (first run) or
  # silently reported the previous run's numbers.
  test_acc = F1_score = 0
  model = GCN(dataset.num_features, hidden_channels, dataset.num_classes)
  print(model)
  model, data = model.to(device), data.to(device)
  optimizer = torch.optim.Adam(model.parameters(), lr=0.002, weight_decay=5e-4)


  # Re-initialise every child layer that exposes reset_parameters().
  for layer in model.children():
    if hasattr(layer, 'reset_parameters'):
        layer.reset_parameters()


  for epoch in range(1, epochs + 1):
      loss = train()
      train_acc, val_acc, tmp_test_acc,tmp_F1_score = test()
      # Model selection: remember the test metrics of the best-validation epoch.
      if val_acc > best_val_acc:
          best_val_acc = val_acc
          test_acc = tmp_test_acc
          F1_score = tmp_F1_score

      log(Epoch=epoch, k=k,Loss=loss, Train=train_acc, Val=val_acc, Test=test_acc)
  print(test_acc)
  print(F1_score)
  accuracies.append(test_acc)
  F1_scores.append(F1_score)
  print("*"*40)

# **GAT**

In [None]:
class Net(torch.nn.Module):
    """Two-layer graph attention network that returns log-probabilities.

    Args:
        in_channels: Size of each input node feature vector.
        out_channels: Number of output classes.
        hidden_channels: Per-head output width of the first layer. Defaults
            to 8, the value that used to be hard-coded.
        heads: Number of attention heads in the first layer. Defaults to 8,
            the value that used to be hard-coded.
    """

    def __init__(self, in_channels, out_channels, hidden_channels=8, heads=8):
        super().__init__()

        self.conv1 = GATConv(in_channels, hidden_channels, heads=heads, dropout=0.6)
        # conv1's input width here is hidden_channels * heads, matching the
        # original hard-coded 8 * 8.
        self.conv2 = GATConv(hidden_channels * heads, out_channels, heads=1, concat=False,
                             dropout=0.6)

    def forward(self, x, edge_index):
        """Apply dropout -> conv1 -> ELU -> dropout -> conv2 -> log_softmax."""
        x = F.dropout(x, p=0.6, training=self.training)
        x = F.elu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=-1)

In [None]:
# Settings for the GAT runs: epochs per run, per-head hidden width, head count.
epochs, hidden_channels, heads = 200, 8, 8
# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def train():
    """Perform one gradient step on the global `model`.

    The whole graph goes through the model and the cross-entropy loss is
    restricted to the nodes flagged by `data.train_mask`.

    Returns:
        The loss of this step converted to a Python float.
    """
    model.train()
    optimizer.zero_grad()
    output = model(data.x, data.edge_index)
    train_nodes = data.train_mask
    batch_loss = F.cross_entropy(output[train_nodes], data.y[train_nodes])
    batch_loss.backward()
    optimizer.step()
    return float(batch_loss)


@torch.no_grad()
def test():
    """Score the global `model` on the train, validation and test masks.

    Returns:
        A list [train_acc, val_acc, test_acc, test_macro_f1]. The accuracies
        are Python floats; the last entry is the value returned by
        `multiclass_f1_score` for the test mask.
    """
    model.eval()
    pred = model(data.x, data.edge_index).argmax(dim=-1)
    correct = pred == data.y

    # Accuracy per split = correct predictions inside the mask / mask size.
    accs = [
        int(correct[mask].sum()) / int(mask.sum())
        for mask in (data.train_mask, data.val_mask, data.test_mask)
    ]
    F1_score = multiclass_f1_score(pred[data.test_mask],  data.y[data.test_mask], num_classes=dataset.num_classes,average="macro")
    return accs + [F1_score]


# Ten independent training runs of the GAT baseline. Each run keeps the test
# accuracy and macro-F1 measured at the epoch with the best validation accuracy.
accuracies=[]
F1_scores=[]

for k in range(10):

  best_val_acc = final_test_acc = 0
  # Give the reported metrics a defined value at the start of every run.
  # They used to be bound only inside the `if` below, so a run whose
  # validation accuracy never rose above 0 raised NameError (first run) or
  # silently reported the previous run's numbers.
  test_acc = F1_score = 0
  model = Net(dataset.num_features, dataset.num_classes)
  model, data = model.to(device), data.to(device)
  optimizer = torch.optim.Adam([
      dict(params=model.conv1.parameters(), weight_decay=5e-4),
      dict(params=model.conv2.parameters(), weight_decay=5e-4)
  ], lr=0.004)


  # Re-initialise every child layer that exposes reset_parameters().
  for layer in model.children():
    if hasattr(layer, 'reset_parameters'):
        layer.reset_parameters()

  for epoch in range(1, epochs + 1):
      loss = train()
      train_acc, val_acc, tmp_test_acc,tmp_F1_score = test()
      # Model selection: remember the test metrics of the best-validation epoch.
      if val_acc > best_val_acc:
          best_val_acc = val_acc
          test_acc = tmp_test_acc
          F1_score = tmp_F1_score

      log(Epoch=epoch, k=k,Loss=loss, Train=train_acc, Val=val_acc, Test=test_acc)
  print(test_acc)
  print(F1_score)
  accuracies.append(test_acc)
  F1_scores.append(F1_score)
  print("*"*40)

# **AugSS-GAT**

In [None]:
class GAT(torch.nn.Module):
    """GAT variant whose first attention layer is applied to two edge sets.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Per-head output width of the first layer.
        out_channels: Output width of the second layer.
        heads: Number of attention heads in the first layer.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, heads):
        super().__init__()
        self.conv1 = GATConv(in_channels, hidden_channels, heads, dropout=0.6)
        self.conv2 = GATConv(hidden_channels * heads, out_channels, heads=1,concat=True, dropout=0.6)
        # Scalar blend weights created on the module-level `device`. forward()
        # currently sums the two branches unweighted, so these are registered
        # parameters that take no part in the computation.
        self.w1 = torch.nn.Parameter(torch.Tensor([0.8]).to(device), requires_grad=True)
        self.w2 = torch.nn.Parameter(torch.Tensor([0.8]).to(device), requires_grad=True)

    def forward(self, x, edge_index, edge_index2):
        """Sum conv1 over both edge sets, then apply conv2 over `edge_index`."""
        x = F.dropout(x, p=0.6, training=self.training)
        # The same first-layer weights are used for both edge sets.
        h1 = F.elu(self.conv1(x, edge_index))
        h2 = F.elu(self.conv1(x, edge_index2))
        # x3 = torch.mul(self.w1,h1) +  torch.mul(self.w2,h2)
        x3 = h1 + h2

        # Bug fix: the dropout result used to be stored in `x` while conv2 was
        # fed the un-dropped `x3`, so the second dropout never had any effect.
        x3 = F.dropout(x3, p=0.6, training=self.training)
        x4 = self.conv2(x3, edge_index)
        return x4

In [None]:
# Settings for the AugSS-GAT runs: per-head hidden width, head count, epochs.
hidden_channels, heads, epochs = 8, 8, 200
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
data = data.to(device)

# Turn the list of [src, dst] pairs into a contiguous long tensor with one
# column per edge, placed on the training device.
edge_index2 = torch.tensor(edge_list2, dtype=torch.long).t().contiguous().to(device)


def train():
    """One training step for the two-graph attention model.

    Feeds `data.edge_index` and the global `edge_index2` to the global
    `model`, and optimises cross-entropy over the `data.train_mask` nodes.

    Returns:
        The loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()
    node_scores = model(data.x, data.edge_index, edge_index2)
    in_train = data.train_mask
    current_loss = F.cross_entropy(node_scores[in_train], data.y[in_train])
    current_loss.backward()
    optimizer.step()
    return float(current_loss)


@torch.no_grad()
def test():
    """Evaluate the two-graph attention model on all three masks.

    Returns:
        A list [train_acc, val_acc, test_acc, test_macro_f1]. The accuracies
        are Python floats; the last entry is the value returned by
        `multiclass_f1_score` for the test mask.
    """
    model.eval()
    pred = model(data.x, data.edge_index, edge_index2).argmax(dim=-1)

    def _split_accuracy(mask):
        # Share of nodes inside `mask` whose predicted class equals the label.
        return int((pred[mask] == data.y[mask]).sum()) / int(mask.sum())

    accs = [
        _split_accuracy(data.train_mask),
        _split_accuracy(data.val_mask),
        _split_accuracy(data.test_mask),
    ]
    accs.append(
        multiclass_f1_score(pred[data.test_mask],  data.y[data.test_mask], num_classes=dataset.num_classes,average="macro")
    )
    return accs

# Ten independent training runs of the two-graph GAT. Each run keeps the test
# accuracy and macro-F1 measured at the epoch with the best validation accuracy.
accuracies=[]
F1_scores=[]

for k in range(10):
  best_val_acc = final_test_acc = 0
  # Give the reported metrics a defined value at the start of every run.
  # They used to be bound only inside the `if` below, so a run whose
  # validation accuracy never rose above 0 raised NameError (first run) or
  # silently reported the previous run's numbers.
  test_acc = F1_score = 0
  model = GAT(dataset.num_features, hidden_channels, dataset.num_classes,heads).to(device)
  optimizer = torch.optim.Adam(model.parameters(), lr=0.007, weight_decay=5e-4)

  # Re-initialise every child layer that exposes reset_parameters().
  for layer in model.children():
    if hasattr(layer, 'reset_parameters'):
        layer.reset_parameters()

  for epoch in range(1, epochs + 1):
      loss = train()
      train_acc, val_acc, tmp_test_acc,tmp_F1_score = test()
      # Model selection: remember the test metrics of the best-validation epoch.
      if val_acc > best_val_acc:
          best_val_acc = val_acc
          test_acc = tmp_test_acc
          F1_score = tmp_F1_score

      log(Epoch=epoch, k=k,Loss=loss, Train=train_acc, Val=val_acc, Test=test_acc)
  print(test_acc)
  print(F1_score)
  accuracies.append(test_acc)
  F1_scores.append(F1_score)
  print("*"*40)

# **GATv2**

In [None]:
class Net(torch.nn.Module):
    """Two-layer GATv2 network that returns log-probabilities.

    NOTE: this class reuses the name `Net`, so running this cell replaces the
    GAT-based `Net` defined earlier in the notebook namespace.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Per-head output width of the first layer.
        out_channels: Number of output classes.
        heads: Number of attention heads in the first layer.
    """

    def __init__(self, in_channels,hidden_channels, out_channels,heads):
        super().__init__()

        self.conv1 = GATv2Conv(in_channels, hidden_channels, heads, dropout=0.6)
        # conv2's input width is hidden_channels * heads.
        self.conv2 = GATv2Conv(
            hidden_channels * heads,
            out_channels,
            heads=1,
            concat=False,
            dropout=0.6,
        )

    def forward(self, x, edge_index):
        """Apply dropout -> conv1 -> ELU -> dropout -> conv2 -> log_softmax."""
        drop_p = 0.6  # dropout is only active while self.training is True
        hidden = F.dropout(x, p=drop_p, training=self.training)
        hidden = F.elu(self.conv1(hidden, edge_index))
        hidden = F.dropout(hidden, p=drop_p, training=self.training)
        class_scores = self.conv2(hidden, edge_index)
        return F.log_softmax(class_scores, dim=-1)

In [None]:
# Settings for the GATv2 runs: epochs per run, per-head hidden width, head count.
epochs, hidden_channels, heads = 200, 20, 10
# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def train():
    """Take a single optimiser step with the global `model` and report the loss.

    The model is run over the whole graph; cross-entropy is evaluated on the
    `data.train_mask` nodes only.

    Returns:
        The loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()
    predictions = model(data.x, data.edge_index)
    mask = data.train_mask
    train_loss = F.cross_entropy(predictions[mask], data.y[mask])
    train_loss.backward()
    optimizer.step()
    return float(train_loss)


@torch.no_grad()
def test():
    """Measure accuracy on each split plus macro-F1 on the test split.

    Returns:
        A list [train_acc, val_acc, test_acc, test_macro_f1]. The accuracies
        are Python floats; the last entry is the value returned by
        `multiclass_f1_score` for the test mask.
    """
    model.eval()
    pred = model(data.x, data.edge_index).argmax(dim=-1)
    labels = data.y

    accs = []
    for split_mask in (data.train_mask, data.val_mask, data.test_mask):
        n_correct = int((pred[split_mask] == labels[split_mask]).sum())
        n_total = int(split_mask.sum())
        accs.append(n_correct / n_total)

    accs.append(
        multiclass_f1_score(pred[data.test_mask],  data.y[data.test_mask], num_classes=dataset.num_classes,average="macro")
    )
    return accs

# Ten independent training runs of the GATv2 baseline. Each run keeps the test
# accuracy and macro-F1 measured at the epoch with the best validation accuracy.
accuracies=[]
F1_scores=[]

for k in range(10):

  best_val_acc = final_test_acc = 0
  # Give the reported metrics a defined value at the start of every run.
  # They used to be bound only inside the `if` below, so a run whose
  # validation accuracy never rose above 0 raised NameError (first run) or
  # silently reported the previous run's numbers.
  test_acc = F1_score = 0
  model = Net(dataset.num_features,hidden_channels, dataset.num_classes,heads)
  model, data = model.to(device), data.to(device)
  print(model)
  optimizer = torch.optim.Adam([
      dict(params=model.conv1.parameters(), weight_decay=5e-4),
      dict(params=model.conv2.parameters(), weight_decay=5e-4)
  ], lr=0.002)


  # Re-initialise every child layer that exposes reset_parameters().
  for layer in model.children():
    if hasattr(layer, 'reset_parameters'):
        layer.reset_parameters()

  for epoch in range(1, epochs + 1):
      loss = train()
      train_acc, val_acc, tmp_test_acc,tmp_F1_score = test()
      # Model selection: remember the test metrics of the best-validation epoch.
      if val_acc > best_val_acc:
          best_val_acc = val_acc
          test_acc = tmp_test_acc
          F1_score = tmp_F1_score

      log(Epoch=epoch, k=k,Loss=loss, Train=train_acc, Val=val_acc, Test=test_acc)
  print(test_acc)
  print(F1_score)
  accuracies.append(test_acc)
  F1_scores.append(F1_score)
  print("*"*40)

# **AugSS-GATv2**

In [None]:
class GAT(torch.nn.Module):
    """GATv2 variant whose first attention layer is applied to two edge sets.

    NOTE: this class reuses the name `GAT`, so running this cell replaces the
    GATConv-based `GAT` defined earlier in the notebook namespace.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Per-head output width of the first layer.
        out_channels: Output width of the second layer.
        heads: Number of attention heads in the first layer.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, heads):
        super().__init__()
        self.conv1 = GATv2Conv(in_channels, hidden_channels, heads, dropout=0.6)
        self.conv2 = GATv2Conv(hidden_channels * heads, out_channels, heads=1,
                             concat=True, dropout=0.6)
        # Learnable scalar blend weights, initialised to 0.8 on the module-level `device`.
        self.w1 = torch.nn.Parameter(torch.Tensor([0.8]).to(device), requires_grad=True)
        self.w2 = torch.nn.Parameter(torch.Tensor([0.8]).to(device), requires_grad=True)

    def forward(self, x, edge_index, edge_index2):
        """Blend conv1 over both edge sets, then apply conv2 over `edge_index`."""
        x = F.dropout(x, p=0.6, training=self.training)
        # The same first-layer weights are used for both edge sets.
        h1 = F.elu(self.conv1(x, edge_index))
        h2 = F.elu(self.conv1(x, edge_index2))
        # x3= (h1+h2)
        x3 = torch.mul(self.w1,h1) +  torch.mul(self.w2,h2)
        # Bug fix: the dropout result used to be stored in `x` while conv2 was
        # fed the un-dropped `x3`, so the second dropout never had any effect.
        x3 = F.dropout(x3, p=0.6, training=self.training)
        x4 = self.conv2(x3, edge_index)
        return x4

In [None]:
# Settings for the AugSS-GATv2 runs: per-head hidden width, head count, epochs.
hidden_channels, heads, epochs = 8, 8, 200
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
data = data.to(device)

# Edge pairs -> long tensor, transposed so each column is one edge, made
# contiguous and moved to the training device.
pair_tensor = torch.tensor(edge_list2, dtype=torch.long)
edge_index2 = pair_tensor.t().contiguous().to(device)


def train():
    """One optimisation step for the two-graph GATv2 model.

    The global `model` is called with `data.edge_index` and the global
    `edge_index2`; cross-entropy is computed on the `data.train_mask` nodes.

    Returns:
        The loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()
    raw_scores = model(data.x, data.edge_index, edge_index2)
    supervised = data.train_mask
    loss_value = F.cross_entropy(raw_scores[supervised], data.y[supervised])
    loss_value.backward()
    optimizer.step()
    return float(loss_value)


@torch.no_grad()
def test():
    """Evaluate the two-graph GATv2 model on the train, validation and test masks.

    Returns:
        A list [train_acc, val_acc, test_acc, test_macro_f1]. The accuracies
        are Python floats; the last entry is the value returned by
        `multiclass_f1_score` for the test mask.
    """
    model.eval()
    pred = model(data.x, data.edge_index, edge_index2).argmax(dim=-1)
    is_correct = pred == data.y

    # Accuracy per split = correct predictions inside the mask / mask size.
    accs = [
        int(is_correct[mask].sum()) / int(mask.sum())
        for mask in (data.train_mask, data.val_mask, data.test_mask)
    ]
    accs.append(
        multiclass_f1_score(pred[data.test_mask],  data.y[data.test_mask], num_classes=dataset.num_classes,average="macro")
    )
    return accs

# Ten independent training runs of the two-graph GATv2. Each run keeps the test
# accuracy and macro-F1 measured at the epoch with the best validation accuracy.
accuracies=[]
F1_scores=[]

for k in range(10):
  best_val_acc = final_test_acc = 0
  # Give the reported metrics a defined value at the start of every run.
  # They used to be bound only inside the `if` below, so a run whose
  # validation accuracy never rose above 0 raised NameError (first run) or
  # silently reported the previous run's numbers.
  test_acc = F1_score = 0
  model = GAT(dataset.num_features, hidden_channels, dataset.num_classes,heads).to(device)
  optimizer = torch.optim.Adam(model.parameters(), lr=0.007, weight_decay=5e-4)

  # Re-initialise every child layer that exposes reset_parameters().
  for layer in model.children():
    if hasattr(layer, 'reset_parameters'):
        layer.reset_parameters()

  for epoch in range(1, epochs + 1):
      loss = train()
      train_acc, val_acc, tmp_test_acc,tmp_F1_score = test()
      # Model selection: remember the test metrics of the best-validation epoch.
      if val_acc > best_val_acc:
          best_val_acc = val_acc
          test_acc = tmp_test_acc
          F1_score = tmp_F1_score

      log(Epoch=epoch, k=k,Loss=loss, Train=train_acc, Val=val_acc, Test=test_acc)
  print(test_acc)
  print(F1_score)
  accuracies.append(test_acc)
  F1_scores.append(F1_score)
  print("*"*40)