In [1]:
import torch
from torch_geometric.data import Data
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from sentence_transformers import SentenceTransformer
import os
from tqdm.notebook import tqdm
from math import log
import numpy as np
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from keras.preprocessing.sequence import pad_sequences
import nltk
import re
from hazm import stopwords_list, word_tokenize 
from hazm import *
import pandas
from sklearn.model_selection import train_test_split
import scipy.sparse as sp
from flair.embeddings import TransformerDocumentEmbeddings, TransformerWordEmbeddings
from flair.data import Sentence
from sklearn.metrics import accuracy_score, f1_score

In [2]:
import pickle
from tqdm import tqdm
from math import log

def write_pickle(path: str, data: list) -> None:
    """
    write_pickle function is written for write data in pickle file
    :param path:
    :param data:
    :return:
    """
    with open(path, "wb") as outfile:
        pickle.dump(data, outfile)


def read_pickle(path: str) -> list:
    """
    read_pickle function for  reading pickle file
    :param path:
    :return:
    """
    with open(path, "rb") as file:
        data = pickle.load(file)
    return data

def preprocess_features(features):
    """Row-normalize feature matrix and convert to tuple representation"""
    rowsum = np.array(features.sum(1))
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    features = r_mat_inv.dot(features)
    return features

def create_features(sent_trans, sentences, train_size, test_size, tokenize_sentences, word_list, device):
    sent_embs = []
    word_embs = {}
    len_of_samples = len(tokenize_sentences)
    word_embedding_dict = {}

    for ind in tqdm(range(train_size + test_size)):

        sent = tokenize_sentences[ind]
        sent_embs.append(sent_trans.encode([sentences[ind]])[0])
        for word in sent:
            if word in word_embs:
                continue
            try:
                word_output = word_embedding_dict[word]
            except:
                word_output  = sent_trans.encode([word])[0]
                word_embedding_dict[word] = word_output
            word_embs[word] = word_output

        if ind % 100 == 0:
            print(ind, ' of ', len_of_samples, ' done...')
    print(len(word_embs))
    word_embs_list = []
    for word in word_list:
        word_embs_list.append(word_embs[word])

    features = sent_embs[:train_size] + word_embs_list + sent_embs[train_size:]
    features = preprocess_features(sp.csr_matrix(features)).todense()
    features = torch.FloatTensor(features).to(device)
    return features



def create_edge_attributes(edge, train_size, test_size, tokenize_sentences, word_id_map, word_list,
                           vocab_length):
    row = []
    col = []
    weight = []
    if edge >= 1:
        window_size = 20
        total_W = 0
        word_occurrence = {}
        word_pair_occurrence = {}

        def ordered_word_pair(a, b):
            if a > b:
                return b, a
            else:
                return a, b

        def update_word_and_word_pair_occurrence(q):
            unique_q = list(set(q))
            for i in unique_q:
                try:
                    word_occurrence[i] += 1
                except:
                    word_occurrence[i] = 1
            for i in range(len(unique_q)):
                for j in range(i + 1, len(unique_q)):
                    word1 = unique_q[i]
                    word2 = unique_q[j]
                    word1, word2 = ordered_word_pair(word1, word2)
                    try:
                        word_pair_occurrence[(word1, word2)] += 1
                    except:
                        word_pair_occurrence[(word1, word2)] = 1

        for ind in tqdm(range(train_size + test_size)):
            words = tokenize_sentences[ind]

            q = []
            # push the first (window_size) words into a queue
            for i in range(min(window_size, len(words))):
                q += [word_id_map[words[i]]]

            # update the total number of the sliding windows
            total_W += 1
            # update the number of sliding windows that contain each word and word pair

            update_word_and_word_pair_occurrence(q)

            now_next_word_index = window_size
            # pop the first word out and let the next word in, keep doing this until the end of the document
            while now_next_word_index < len(words):
                q.pop(0)
                q += [word_id_map[words[now_next_word_index]]]
                now_next_word_index += 1
                # update the total number of the sliding windows
                total_W += 1
                # update the number of sliding windows that contain each word and word pair
                update_word_and_word_pair_occurrence(q)

        for word_pair in word_pair_occurrence:
            i = word_pair[0]
            j = word_pair[1]
            count = word_pair_occurrence[word_pair]
            word_freq_i = word_occurrence[i]
            word_freq_j = word_occurrence[j]
            pmi = log((count * total_W) / (word_freq_i * word_freq_j))
            if pmi <= 0:
                continue
            row.append(train_size + i)
            col.append(train_size + j)
            weight.append(pmi)
            row.append(train_size + j)
            col.append(train_size + i)
            weight.append(pmi)
    # get each word appears in which document
    word_doc_list = {}
    for word in word_list:
        word_doc_list[word] = []

    for i in range(len(tokenize_sentences)):
        doc_words = tokenize_sentences[i]
        unique_words = set(doc_words)
        for word in unique_words:
            exsit_list = word_doc_list[word]
            exsit_list.append(i)
            word_doc_list[word] = exsit_list

    # document frequency
    word_doc_freq = {}
    for word, doc_list in word_doc_list.items():
        word_doc_freq[word] = len(doc_list)

    # term frequency
    doc_word_freq = {}

    for doc_id in range(len(tokenize_sentences)):
        words = tokenize_sentences[doc_id]
        for word in words:
            word_id = word_id_map[word]
            doc_word_str = str(doc_id) + ',' + str(word_id)
            if doc_word_str in doc_word_freq:
                doc_word_freq[doc_word_str] += 1
            else:
                doc_word_freq[doc_word_str] = 1

    for i in range(len(tokenize_sentences)):
        words = tokenize_sentences[i]
        doc_word_set = set()
        for word in words:
            if word in doc_word_set:
                continue
            j = word_id_map[word]
            key = str(i) + ',' + str(j)
            freq = doc_word_freq[key]
            if i < train_size:
                row.append(i)
            else:
                row.append(i + vocab_length)
            col.append(train_size + j)
            idf = log(1.0 * len(tokenize_sentences) / word_doc_freq[word_list[j]])
            weight.append(freq * idf)
            doc_word_set.add(word)

    if edge >= 2:
        tokenize_sentences_set = [set(s) for s in tokenize_sentences]
        jaccard_threshold = 0.2
        for i in tqdm(range(len(tokenize_sentences))):
            for j in range(i + 1, len(tokenize_sentences)):
                jaccard_w = 1 - nltk.jaccard_distance(tokenize_sentences_set[i],
                                                      tokenize_sentences_set[j])
                if jaccard_w > jaccard_threshold:
                    if i < train_size:
                        row.append(i)
                    else:
                        row.append(i + vocab_length)
                    if j < train_size:
                        col.append(j)
                    else:
                        col.append(vocab_length + j)
                    weight.append(jaccard_w)
                    if j < train_size:
                        row.append(j)
                    else:
                        row.append(j + vocab_length)
                    if i < train_size:
                        col.append(i)
                    else:
                        col.append(vocab_length + i)
                    weight.append(jaccard_w)

    return row, col, weight


def generate_train_val(train_size, vocab_length, node_size, train_pro=0.9):
    real_train_size = int(train_pro * train_size)
    val_size = train_size - real_train_size

    idx_train = np.random.choice(train_size, real_train_size, replace=False)
    idx_train.sort()
    idx_val = []
    pointer = 0
    for v in range(train_size):
        if pointer < len(idx_train) and idx_train[pointer] == v:
            pointer += 1
        else:
            idx_val.append(v)
    idx_test = range(train_size + vocab_length, node_size)

    return idx_train, idx_val, idx_test

In [3]:
EDGE = 2 # 0:d2w 1:d2w+w2w 2:d2w+w2w+d2d
NODE = 1 # 0:one-hot #1:BERT 
NUM_LAYERS = 2 

HIDDEN_DIM = 200
DROP_OUT = 0.5
LR = 0.02
WEIGHT_DECAY = 0
EARLY_STOPPING = 10
NUM_EPOCHS = 200
#normalizer = Normalizer()
REMOVE_LIMIT = 20

In [None]:
digi_dataset = pandas.read_excel("digikala.xlsx")

In [None]:
digi_dataset = digi_dataset[['comment','recommend']]
print(digi_dataset.head())
digi_dataset = digi_dataset[(digi_dataset['recommend'] == "recommended") | (digi_dataset['recommend'] == "not_recommended") | (digi_dataset['recommend'] == "no_idea") ]
print(digi_dataset.shape)
digi_dataset = digi_dataset.reset_index(drop=True )
print(digi_dataset.head())

In [None]:
original_word_freq = {}
for sentence in list(digi_dataset["comment"]):
    word_list = word_tokenize(str(sentence))
    for word in word_list:
        if word in original_word_freq:
            original_word_freq[word] += 1
        else:
            original_word_freq[word] = 1   

print('done')

limited_sentences = []
limited_labels = []
word_list_dict = {}
for i,sentence in tqdm(enumerate(list(digi_dataset["comment"]))):
    word_list_temp = word_tokenize(str(sentence))
    doc_words = []
    for word in word_list_temp: 
        if word in original_word_freq and word not in stopwords_list() and original_word_freq[word] >= REMOVE_LIMIT:
            doc_words.append(word)
            word_list_dict[word] = 1
    if len(doc_words) > 0:
        limited_sentences.append(" ".join(doc_words))
        limited_labels.append(list(digi_dataset["recommend"])[i])

In [None]:
assert len(limited_sentences) == len(limited_labels)
digi_dataset = pandas.DataFrame({"comment":limited_sentences,"recommend":limited_labels})
digi_dataset.to_csv('digikala_data_remove20.csv', index=False)

In [None]:
# Load the pre-processed dataset

In [4]:
digi_dataset = pandas.read_csv(r"C:/Users/AGM1/F.Gh/Digikala-3Class.csv")
digi_dataset = digi_dataset.dropna()

In [5]:
digi_dataset.head()

Unnamed: 0,comment,recommend
0,سلام ، نظرم بگم میخواستم موضوع اشاره نظراتی کا...,recommended
1,گیره های فلزی سخت میشوند حوله سخت توان _x 000 ...,not_recommended
2,رابطه ظاهر گوشی . بدنه یکپارچه صفحه نمایش کیفی...,no_idea
3,ظرفیتش براتون کافیه حتما بخرید . _x 000 D_ یه ...,no_idea
4,سلام دوستان ، ، _x 000 D_ منم مثه دوستان خرید ...,recommended


In [6]:
nan_count = 0
for item in list(digi_dataset.comment):
     if type(item) is not str:
        print(item, '*' * 10,type(item))
        print('#' * 10)
        nan_count += 1
print(nan_count)

0


In [7]:
original_train, original_test = train_test_split(digi_dataset, test_size = 0.2)

original_train_sentences = list(original_train['comment'])
original_labels_train = list(original_train['recommend'])

original_test_sentences = list(original_test['comment'])
original_labels_test = list(original_test['recommend'])


train_size = len(original_train_sentences)
test_size = len(original_test_sentences)
sentences = original_train_sentences + original_test_sentences
print(train_size, test_size)

50331 12583


In [47]:
unique_labels=np.unique(original_labels_train)

num_class = len(unique_labels)
lEnc = LabelEncoder()
lEnc.fit(unique_labels)

print(unique_labels)
print(lEnc.transform(unique_labels))

train_labels = lEnc.transform(original_labels_train)
test_labels = lEnc.transform(original_labels_test)

device = torch.device('cpu')

labels = train_labels.tolist()+test_labels.tolist()
labels = torch.LongTensor(labels).to(device)

['no_idea' 'not_recommended' 'recommended']
[0 1 2]


In [48]:
print(len(labels))
print(len(train_labels))
print(len(test_labels))
print(len(sentences))

62914
50331
12583
62914


In [10]:
original_word_freq = {}  # to remove rare words
for sentence in sentences:
    word_list = str(sentence).split()
    for word in word_list:
        if word in original_word_freq:
            original_word_freq[word] += 1
        else:
            original_word_freq[word] = 1   

print('done')

tokenize_sentences = []
word_list_dict = {}
for i,sentence in tqdm(enumerate(sentences)):
    word_list_temp = str(sentence).split()
    doc_words = []
    for word in word_list_temp: 
        if word in original_word_freq:
            doc_words.append(word)
            word_list_dict[word] = 1
    tokenize_sentences.append(doc_words)

word_list = list(word_list_dict.keys())
vocab_length = len(word_list)


word_id_map = {}
for i in range(vocab_length):
    word_id_map[word_list[i]] = i   

done


62914it [00:00, 98890.63it/s] 


In [11]:
node_size = train_size + vocab_length + test_size
print(node_size)

70362


In [12]:
create_edge_attributes_path = "C:/Users/AGM1/F.Gh/attribute_Digikala-3Class.pickle"

if os.path.exists(create_edge_attributes_path):
    print('loading attr from local ...')
    ATTR = read_pickle(create_edge_attributes_path)
    row, col, weight = ATTR[0], ATTR[1], ATTR[2]
else:
    row, col, weight = create_edge_attributes(edge=EDGE,
                                              train_size=train_size,
                                              test_size=test_size,
                                              tokenize_sentences=tokenize_sentences,
                                              word_list=word_list,
                                              word_id_map=word_id_map,
                                              vocab_length=vocab_length)
    write_pickle(create_edge_attributes_path, data=[row, col, weight])

loading attr from local ...


In [None]:
Parsbert_PATH = 'C:/Users/Downloads/Parsbert'
Parsbert_MODEL = SentenceTransformer(Parsbert_PATH)

In [13]:
features_saved_path = "C:/Users/AGM1/F.Gh/features_Digikala-3Class.pickle"

if os.path.exists(features_saved_path):
    print('loading features from local ...')
    features = read_pickle(features_saved_path)

else:

    features = create_features(sent_trans  = Parsbert_MODEL,
                                        sentences  = sentences,
                                        train_size = train_size, 
                                        test_size  =  test_size,
                                        tokenize_sentences=str(tokenize_sentences), 
                                        word_list=word_list, 
                                        device=device)
    write_pickle(features_saved_path, data=features)

loading features from local ...


In [14]:
print(features.shape)
index = 123
print(row[index], col[index], weight[index])
print(len(row))
print(len(col))
print(len(weight))

torch.Size([70395, 512])
50372 50364 0.8472357834625874
14685859
14685859
14685859


In [15]:
edge_index = [row, col]
edge_attr = weight
edge_index = torch.tensor(edge_index, dtype=torch.long)
print(edge_index.shape)
edge_attr = torch.FloatTensor(edge_attr)
edge_attr = edge_attr.reshape((len(weight), 1))
print(edge_attr.shape)

torch.Size([2, 14685859])
torch.Size([14685859, 1])


In [16]:
data = Data(x=features, edge_index=edge_index, edge_attr=edge_attr)
print(data)

Data(x=[70395, 512], edge_index=[2, 14685859], edge_attr=[14685859, 1])


In [17]:
node_size = train_size + vocab_length + test_size
node_size

70362

In [18]:
idx_train, idx_val, idx_test = generate_train_val(train_size, vocab_length, node_size, train_pro=0.9)

### ClusterData

In [94]:
train_mask = [0] * node_size
for item in idx_train:
    train_mask[item] = 1

val_mask = [0] * node_size
for item in idx_val:
    val_mask[item] = 1

test_mask = [0] * node_size
for item in idx_test:
    test_mask[item] = 1

In [95]:
data = Data(x=features, edge_index=edge_index, edge_attr=edge_attr, train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)
print(data)

Data(x=[70395, 512], edge_index=[2, 14685859], edge_attr=[14685859, 1], train_mask=[70362], val_mask=[70362], test_mask=[70362])


In [None]:
# Using ClusterData and graph partitioning with METIS

In [96]:
from torch_geometric.loader import ClusterData
loader = ClusterData(data,num_parts = 4)

Computing METIS partitioning...
Done!


In [97]:
data = []
for i in loader:
    data.append(i)

In [98]:
data[0]

Data(x=[6834, 512], edge_attr=[179231, 1], train_mask=[70362], val_mask=[70362], test_mask=[70362], edge_index=[2, 179231])

In [99]:
import torch.nn.functional as F
from torch.nn import Linear, Dropout
from torch_geometric.nn import  GATConv, GCNConv, GINConv

In [None]:
# GCN

In [100]:
class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(data.num_node_features, 300)     # data.num_node_features = 512
        self.conv2 = GCNConv(300, 100)
        self.linear = torch.nn.Linear(100, 3)
        
    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.linear(x)
        return F.log_softmax(x, dim=1)

In [103]:
model = GCN()
model

GCN(
  (conv1): GCNConv(512, 300)
  (conv2): GCNConv(300, 100)
  (linear): Linear(in_features=100, out_features=3, bias=True)
)

In [None]:
def accuracy(pred_y, y):
    """Calculate accuracy."""
    return (pred_y == y).sum().item() / len(y)

def train(model, optimizer, loader, labels, device):
    model.train()
    total_loss=0
    train_loss=0
    train_acc=0
    val_loss=0
    val_acc=0
    
    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        out = model(batch) 
        
        # Training
        train_loss = F.nll_loss(out[batch.train_mask], labels[batch.train_mask])
        train_acc  = accuracy(out[batch.train_mask].argmax(dim=1), labels[batch.train_mask])
        
        train_loss.backward()
        optimizer.step()
        
        # Validation
        val_loss = F.nll_loss(out[batch.val_mask], labels[batch.val_mask])
        val_acc  = accuracy(out[batch.val_mask].argmax(dim=1), labels[batch.val_mask])
        
        #==========================Loss=================================
        train_loss += train_loss.item() 
        train_acc += train_acc
        val_loss += val_loss.item()    
        val_acc += val_acc
        
    train_loss = train_loss/len(loader)
    train_acc  = train_acc/len(loader)
    val_loss = val_acc/len(loader)
    val_acc = val_acc / len(loader)
# #     print('val_total_acc:', val_total_acc)
# #     print('train_total_acc:', train_total_acc)
# #     print("val_total_nodes:", val_total_nodes)
# #     print("train_total_nodes:", train_total_nodes)
    return train_loss, train_acc, val_loss, val_acc


In [None]:
# Print metrics every 10 epochs
device = 'cuda'

model = GCN().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
for epoch in range(1000):
    if(epoch % 10 == 0):
        train_loss,val_loss, val_acc, train_acc= train (model, optimizer, loader, labels, device)
        print(f'Epoch {epoch:>3} | Train Loss: {train_loss:.3f} | Train Acc:'
                  f' {train_acc*100:>6.2f}% | Val Loss: {val_loss:.2f} | '
                  f'Val Acc: {val_acc*100:.2f}%')
        print('___________________________________________________________________________________________________')

In [None]:
best_acc = 0
if train_acc >= best_acc:
        best_acc = train_acc
torch.save(model, 'digi_best_model_gcn.pt')
print('best saved model is :', best_acc)

In [None]:
# Gin

In [None]:
class GIN(torch.nn.Module):
    def __init__(self):
        super(GIN, self).__init__()
        self.conv1 = GINConv(nn.Sequential(nn.Linear(data.num_node_features, 300), nn.ReLU(), nn.Linear(300, 300)))
        self.conv2 = GINConv(nn.Sequential(nn.Linear(300, 100), nn.ReLU(), nn.Linear(100, 100)))
        self.linear = nn.Linear(100, 3)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.linear(x)
        return F.log_softmax(x, dim=1)


In [None]:
def accuracy(pred_y, y):
    """Calculate accuracy."""
    return (pred_y == y).sum().item() / len(y)

def train(model, optimizer, loader, labels, device):
    model.train()
    total_loss=0
    train_loss=0
    train_acc=0
    val_loss=0
    val_acc=0
    
    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        out = model(batch) 
        
        # Training
        train_loss = F.nll_loss(out[batch.train_mask], labels[batch.train_mask])
        train_acc  = accuracy(out[batch.train_mask].argmax(dim=1), labels[batch.train_mask])
        
        train_loss.backward()
        optimizer.step()
        
        # Validation
        val_loss = F.nll_loss(out[batch.val_mask], labels[batch.val_mask])
        val_acc  = accuracy(out[batch.val_mask].argmax(dim=1), labels[batch.val_mask])
        
        #==========================Loss=================================
        train_loss += train_loss.item() 
        train_acc += train_acc
        val_loss += val_loss.item()    
        val_acc += val_acc
        
    train_loss = train_loss/len(loader)
    train_acc  = train_acc/len(loader)
    val_loss = val_acc/len(loader)
    val_acc = val_acc / len(loader)
# #     print('val_total_acc:', val_total_acc)
# #     print('train_total_acc:', train_total_acc)
# #     print("val_total_nodes:", val_total_nodes)
# #     print("train_total_nodes:", train_total_nodes)
    return train_loss, train_acc, val_loss, val_acc


In [None]:
# Print metrics every 10 epochs
device = 'cuda'

model = GIN().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
for epoch in range(1000):
    if(epoch % 10 == 0):
        train_loss,val_loss, val_acc, train_acc= train (model, optimizer, loader, labels, device)
        print(f'Epoch {epoch:>3} | Train Loss: {train_loss:.3f} | Train Acc:'
                  f' {train_acc*100:>6.2f}% | Val Loss: {val_loss:.2f} | '
                  f'Val Acc: {val_acc*100:.2f}%')
        print('___________________________________________________________________________________________________')

In [None]:
best_acc = 0
if train_acc >= best_acc:
        best_acc = train_acc
torch.save(model, 'digi_best_model_gin.pt')
print('best saved model is :', best_acc)

In [None]:
#GAT

In [None]:
class GAT(torch.nn.Module):
    def __init__(self, num_heads=1):
        super(GAT, self).__init__()
        self.conv1 = GATConv(data.num_node_features, 300, heads=num_heads)
        self.conv2 = GATConv(300 * num_heads, 100, heads=num_heads)
        self.linear = nn.Linear(100 * num_heads, 3)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, training=self.training)
        x = self.linear(x)
        return F.log_softmax(x, dim=1)


In [None]:
def accuracy(pred_y, y):
    """Calculate accuracy."""
    return (pred_y == y).sum().item() / len(y)

def train(model, optimizer, loader, labels, device):
    model.train()
    total_loss=0
    train_loss=0
    train_acc=0
    val_loss=0
    val_acc=0
    
    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        out = model(batch) 
        
        # Training
        train_loss = F.nll_loss(out[batch.train_mask], labels[batch.train_mask])
        train_acc  = accuracy(out[batch.train_mask].argmax(dim=1), labels[batch.train_mask])
        
        train_loss.backward()
        optimizer.step()
        
        # Validation
        val_loss = F.nll_loss(out[batch.val_mask], labels[batch.val_mask])
        val_acc  = accuracy(out[batch.val_mask].argmax(dim=1), labels[batch.val_mask])
        
        #==========================Loss=================================
        train_loss += train_loss.item() 
        train_acc += train_acc
        val_loss += val_loss.item()    
        val_acc += val_acc
        
    train_loss = train_loss/len(loader)
    train_acc  = train_acc/len(loader)
    val_loss = val_acc/len(loader)
    val_acc = val_acc / len(loader)
# #     print('val_total_acc:', val_total_acc)
# #     print('train_total_acc:', train_total_acc)
# #     print("val_total_nodes:", val_total_nodes)
# #     print("train_total_nodes:", train_total_nodes)
    return train_loss, train_acc, val_loss, val_acc


In [None]:
# Print metrics every 10 epochs
device = 'cuda'

model = GAT().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
for epoch in range(1000):
    if(epoch % 10 == 0):
        train_loss,val_loss, val_acc, train_acc= train (model, optimizer, loader, labels, device)
        print(f'Epoch {epoch:>3} | Train Loss: {train_loss:.3f} | Train Acc:'
                  f' {train_acc*100:>6.2f}% | Val Loss: {val_loss:.2f} | '
                  f'Val Acc: {val_acc*100:.2f}%')
        print('___________________________________________________________________________________________________')

In [None]:
best_acc = 0
if train_acc >= best_acc:
        best_acc = train_acc
torch.save(model, 'digi_best_model_gat.pt')
print('best saved model is :', best_acc)

In [None]:
#GCN2

In [None]:
class GCN2(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(data.num_node_features, 300)     # data.num_node_features = 512
        self.conv2 = GCNConv(300, 100)
        self.linear = torch.nn.Linear(100, 3)
        
    def forward(self, data):
        x, edge_index = data[3].x, data[0].edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.linear(x)
        return F.log_softmax(x, dim=1)

In [None]:
def accuracy(pred_y, y):
    """Calculate accuracy."""
    return (pred_y == y).sum().item() / len(y)

def train(model, optimizer, loader, labels, device):
    model.train()
    total_loss=0
    train_loss=0
    train_acc=0
    val_loss=0
    val_acc=0
    
    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        out = model(batch) 
        
        # Training
        train_loss = F.nll_loss(out[batch.train_mask], labels[batch.train_mask])
        train_acc  = accuracy(out[batch.train_mask].argmax(dim=1), labels[batch.train_mask])
        
        train_loss.backward()
        optimizer.step()
        
        # Validation
        val_loss = F.nll_loss(out[batch.val_mask], labels[batch.val_mask])
        val_acc  = accuracy(out[batch.val_mask].argmax(dim=1), labels[batch.val_mask])
        
        #==========================Loss=================================
        train_loss += train_loss.item() 
        train_acc += train_acc
        val_loss += val_loss.item()    
        val_acc += val_acc
        
    train_loss = train_loss/len(loader)
    train_acc  = train_acc/len(loader)
    val_loss = val_acc/len(loader)
    val_acc = val_acc / len(loader)
# #     print('val_total_acc:', val_total_acc)
# #     print('train_total_acc:', train_total_acc)
# #     print("val_total_nodes:", val_total_nodes)
# #     print("train_total_nodes:", train_total_nodes)
    return train_loss, train_acc, val_loss, val_acc


In [None]:
# Print metrics every 10 epochs
device = 'cuda'

model = GCN2().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
for epoch in range(1000):
    if(epoch % 10 == 0):
        train_loss,val_loss, val_acc, train_acc= train (model, optimizer, loader, labels, device)
        print(f'Epoch {epoch:>3} | Train Loss: {train_loss:.3f} | Train Acc:'
                  f' {train_acc*100:>6.2f}% | Val Loss: {val_loss:.2f} | '
                  f'Val Acc: {val_acc*100:.2f}%')
        print('___________________________________________________________________________________________________')

In [None]:
best_acc = 0
if train_acc >= best_acc:
        best_acc = train_acc
torch.save(model, 'digi_best_model_gcn2.pt')
print('best saved model is :', best_acc)

In [None]:
# Ensemble Learning

In [None]:
model_gcn = torch.load('digi_best_model_gcn.pt')
model_gin = torch.load('digi_best_model_gin.pt')
model_gat = torch.load('digi_best_model_gat.pt')
model_gcn2 = torch.load('digi_best_model_gcn2.pt')

model_gcn.eval()
model_gin.eval()
model_gat.eval()
model_gcn2.eval()

acc_gcn = 0
acc_gin = 0
acc_gat = 0
acc_gcn2 = 0
acc = 0
for batch in loader:
       
    batch = batch.to(device)
       
    optimizer.zero_grad()
    out_gcn = model_gcn(batch)
    out_gin = model_gin(batch)
    out_gat = model_gat(batch)
    out_gcn2 = model_gcn2(batch)
    out = (out_gcn + out_gin + out_gat + out_gcn2)/4
       
    acc_gcn += accuracy(out_gcn[batch.test_mask].argmax(dim=1),labels[batch.test_mask])/ len(loader)
    acc_gin += accuracy(out_gin[batch.test_mask].argmax(dim=1),labels[batch.test_mask])/ len(loader)
    acc_gat += accuracy(out_gat[batch.test_mask].argmax(dim=1),labels[batch.test_mask])/ len(loader)
    acc_gcn2 += accuracy(out_gcn2[batch.test_mask].argmax(dim=1),labels[batch.test_mask])/ len(loader)
    acc_predictions = out[batch.test_mask].argmax(dim=1).cpu()
    acc_labels = labels[batch.test_mask].cpu()
   
    acc += accuracy(acc_labels, acc_predictions)/ len(loader)
   
    precision += precision_score(acc_labels, acc_predictions, average='weighted')/ len(loader)
    recall += recall_score(acc_labels, acc_predictions, average='weighted')/ len(loader)
    f1 += f1_score(acc_labels, acc_predictions, average='weighted')/ len(loader)
       

# Print results
print(f'GCN accuracy:     {acc_gcn*100:.2f}%')
print(f'GAT accuracy:     {acc_gat*100:.2f}%')
print(f'GIN accuracy:     {acc_gat*100:.2f}%')
print(f'GCN2 accuracy:     {acc_gat*100:.2f}%')

print(f'Overall accuracy: {acc*100:.2f}%')
print(f'Precision:         {precision*100:.2f}%')
print(f'Recall:            {recall*100:.2f}%')
print(f'F1-score:          {f1*100:.2f}%')