### Mount drive 
##### (If using google colab)

In [None]:
# Mount Google Drive at /content/drive (only meaningful when running on Google Colab).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Change into the project folder on the mounted drive and list its contents.
%cd /content/drive/MyDrive/DFS-graph_classification/
%ls

### Code to create graphs and sequences
##### (Only needs to be run **once** to create the graphs and sequences for the dataset. If they already exist, there is **no need** to create them again. To overwrite them with new ones, run this again and respond Y to delete the existing ones.)
##### Need to set input arguments as desired for data processing in args.py

In [None]:
# Run the project's build.sh script with bash.
!bash build.sh

In [None]:
import random
import time
import pickle
import os
import json
from args import Args
from utils import create_dirs
from datasets.process_dataset import create_graphs
from datasets.process_sequence import create_sequences
from datasets.preprocess import calc_max_prev_node, dfscodes_weights
seed = 7

def _print_feature_map_summary(dataset_path, subdir, title):
    """Load ``<dataset_path><subdir>/map.dict`` and print its graph statistics.

    Args:
        dataset_path: Path prefix of the processed dataset (concatenated as-is,
            so it is expected to end with a path separator).
        subdir: Sub-folder holding the ``map.dict`` file, e.g. ``'label_0'``.
        title: Text inserted into the "Information about ..." heading.

    Returns:
        The unpickled feature map.
    """
    # NOTE: pickle.load can execute arbitrary code; only load map.dict files
    # that were produced by this project's own preprocessing.
    with open(dataset_path + subdir + '/map.dict', 'rb') as f:
        feature_map = pickle.load(f)

    print('\n\nInformation about ' + title + ': \n')
    print('Max number of nodes: {}'.format(feature_map['max_nodes']))
    print('Max number of edges: {}'.format(feature_map['max_edges']))
    print('Min number of nodes: {}'.format(feature_map['min_nodes']))
    print('Min number of edges: {}'.format(feature_map['min_edges']))
    print('Max degree of a node: {}'.format(feature_map['max_degree']))
    print('No. of node labels: {}'.format(len(feature_map['node_forward'])))
    print('No. of edge labels: {}'.format(len(feature_map['edge_forward'])))
    return feature_map


if __name__ == '__main__':
    args = Args()
    args = args.update_args()

    create_dirs(args)

    random.seed(seed)

    # Create the graphs and their min DFS codes; the call also returns two timings.
    graphs, create_graphs_time, dfscode_time = create_graphs(args)

    print('Graph type:', args.graph_type)

    # Same statistics for class 0, class 1 and the whole dataset.
    # `feature_map` ends up holding the map of all graphs, as it did before.
    for subdir, title in (('label_0', 'Graphs of class 0'),
                          ('label_1', 'Graphs of class 1'),
                          ('all_graphs', 'All the Graphs')):
        feature_map = _print_feature_map_summary(
            args.current_dataset_path, subdir, title)

    # Create the sequences of min DFS codes.
    create_sequences_time = create_sequences(args)

    # Persist the three timings next to the dataset.
    param_time = {"create_graphs": create_graphs_time,
                  "dfscode": dfscode_time,
                  "create_sequence": create_sequences_time}

    with open('datasets/' + args.graph_type + '/time.txt', 'w') as f:
        f.write(json.dumps(param_time, indent=2))

### Import libraries

In [None]:
import numpy as np
import json
import time
import os
import pandas as pd

# from bow import Vocabulary
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Embedding, LSTM, GRU, Bidirectional
import keras
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
# from sklearn.metrics import roc_auc_score
import random
import csv
# One seed shared by Python's `random`, NumPy and TensorFlow.
seed = 7

np.random.seed(seed)
random.seed(seed)
tf.random.set_seed(seed)

### Preprocess data

#### Settings

In [None]:
# Create the results folder; exist_ok makes re-running this cell harmless.
os.makedirs('results', exist_ok=True)

In [None]:
# Dataset name to test on. The preprocessing cells below define vocabulary sizes for
# MUTAG, PTC_FR, NCI-H23, TOX21_AR, IMDB-BINARY and DBLP_v1; other names use a fallback size.
dataset = 'MUTAG' #dataset name you wish to test on

In [None]:
# Create results/<dataset>; exist_ok makes re-running this cell harmless.
os.makedirs(os.path.join('results', dataset), exist_ok=True)

In [None]:
edge_type = 'normal' #'normal' for original edge labels and 'default' for default edge labels

In [None]:
# Create results/<dataset>/<edge_type>; exist_ok makes re-running this cell harmless.
os.makedirs(os.path.join('results', dataset, edge_type), exist_ok=True)

In [None]:
# Selects which sequence variant the preprocessing cells below load.
time_type = 'w' #'w' for with timestamp (min dfs code) and 'wo' for without timestamp (min dfs code variant)

In [None]:
# Create results/<dataset>/<edge_type>/<time_type>; exist_ok makes re-running this cell harmless.
os.makedirs(os.path.join('results', dataset, edge_type, time_type), exist_ok=True)

#### Function to create bag of words

In [None]:
from collections import Counter
import json
import numpy as np
 
class Vocabulary:
    """Word-to-index vocabulary built from space-separated sentences.

    Word frequencies are counted over the sentences given to
    ``PrepareVocabulary`` and stored as JSON in ``wordFrequencyFilePath``. The
    ``vocabulary`` most frequent words receive the indices
    ``0 .. vocabulary - 1`` in order of decreasing frequency, so the mapping is
    the same on every run (it used to depend on set iteration order).

    NOTE(review): index 0 belongs to a real word and is also the value that
    ``TransformSentencesToId`` leaves in place for unknown words, so the two
    cannot be told apart downstream. Reserving 0 would require a larger
    embedding ``input_dim`` in the models, so it is not changed here.
    """

    def __init__(self, vocabulary, wordFrequencyFilePath):
        """Store the vocabulary size limit and the path of the frequency file."""
        self.vocabulary = vocabulary
        self.WORD_FREQUENCY_FILE_FULL_PATH = wordFrequencyFilePath
        self.input_word_index = {}
        self.reverse_input_word_index = {}

        self.MaxSentenceLength = None

    def PrepareVocabulary(self, reviews):
        """Count word frequencies, build both index maps and record the longest sentence.

        ``MaxSentenceLength`` is the largest number of space-separated tokens in
        any sentence, or 0 when ``reviews`` is empty.
        """
        self._prepare_Word_Frequency_Count_File(reviews)
        self._create_Vocab_Indexes()

        self.MaxSentenceLength = max(
            (len(txt.split(" ")) for txt in reviews), default=0)

    def Get_Top_Words(self, number_words=None):
        """Return the set of the ``number_words`` most frequent words.

        Falls back to ``self.vocabulary`` when ``number_words`` is None.
        """
        return set(self._ranked_top_words(number_words))

    def _ranked_top_words(self, number_words=None):
        """Return the most frequent words as a list, most frequent first."""
        if number_words is None:
            number_words = self.vocabulary

        # The context manager closes the file handle (it was left open before).
        with open(self.WORD_FREQUENCY_FILE_FULL_PATH) as frequency_file:
            counter = Counter(json.load(frequency_file))
        return [word for word, _count in counter.most_common(number_words)]

    def _prepare_Word_Frequency_Count_File(self, reviews):
        """Write the word -> count mapping of ``reviews`` to the frequency file as JSON."""
        counter = Counter()
        for s in reviews:
            counter.update(s.split(" "))

        with open(self.WORD_FREQUENCY_FILE_FULL_PATH, 'w') as output_file:
            output_file.write(json.dumps(counter))

    def _create_Vocab_Indexes(self):
        """Rebuild ``input_word_index`` and ``reverse_input_word_index`` from the frequency file."""
        ranked_words = self._ranked_top_words(self.vocabulary)

        # Fresh dicts, so entries from an earlier PrepareVocabulary call cannot linger.
        self.input_word_index = {word: i for i, word in enumerate(ranked_words)}
        self.reverse_input_word_index = {
            i: word for word, i in self.input_word_index.items()}

    def TransformSentencesToId(self, sentences):
        """Map each sentence to a float NumPy vector of word indices.

        Words missing from the vocabulary keep the initial value 0.
        """
        vectors = []
        for r in sentences:
            words = r.split(" ")
            vector = np.zeros(len(words))

            for t, word in enumerate(words):
                if word in self.input_word_index:
                    vector[t] = self.input_word_index[word]

            vectors.append(vector)

        return vectors

    def ReverseTransformSentencesToId(self, sentences):
        """Same mapping as ``TransformSentencesToId``.

        Despite its name this method always performed the identical
        word -> index transformation; it is kept for callers that use this name.
        """
        return self.TransformSentencesToId(sentences)

#### with timestamp

In [None]:
if time_type == 'w' and edge_type == 'normal':
    # Sequences with timestamps for the original edge labels.
    path = 'datasets/' + dataset + '/sequences/with_timestamp/'

    # One JSON file per class: label_1 -> "positive", label_0 -> "negative".
    with open(path + 'label_1/with_timestamp_sequence_label_1.txt', 'r') as f:
        reviews_positive = json.load(f)

    with open(path + 'label_0/with_timestamp_sequence_label_0.txt', 'r') as f:
        reviews_negative = json.load(f)

    # Vocabulary size per dataset; datasets not listed fall back to 100000.
    TOP_WORDS = {
        'MUTAG': 372,
        'PTC_FR': 1253,
        'NCI-H23': 7260,
        'TOX21_AR': 8780,
        'DBLP_v1': 58184,
    }.get(dataset, 100000)

    print(dataset)
    print("TOP WORDS: ", TOP_WORDS)

    # Pair every sequence with its label (1.0 / 0.0), then shuffle with a fixed seed.
    Reviews_Labeled = (
        list(zip(reviews_positive, np.ones(len(reviews_positive))))
        + list(zip(reviews_negative, np.zeros(len(reviews_negative))))
    )
    random.seed(7)
    random.shuffle(Reviews_Labeled)
    print(Reviews_Labeled)

    # Build the vocabulary; its word counts are written to <path>analysis.vocab.
    vocab = Vocabulary(TOP_WORDS, path + "analysis.vocab")
    reviews_text = [pair[0] for pair in Reviews_Labeled]
    vocab.PrepareVocabulary(reviews_text)

    with open(path + "analysis.vocab") as f:
        data = json.load(f)
        print('Total number of words: ' + str(len(data)))

    # Encode the sequences as index vectors and pad them to the longest one.
    reviews, labels = zip(*Reviews_Labeled)
    reviews_int = vocab.TransformSentencesToId(reviews)

    X = np.array(reviews_int, dtype=object)
    max_len = max(len(encoded) for encoded in X)
    print("Maximum length :", max_len)
    max_review_length = max_len
    X = sequence.pad_sequences(X, maxlen=max_review_length)
    Y = np.array(labels)

    # 5-fold stratified splitter reused by the cells below.
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)


In [None]:
if time_type == 'w' and edge_type == 'default':
    # Sequences with timestamps for the default edge labels.
    # NOTE(review): this is an absolute Drive path under NTU_graph_classification, unlike the
    # relative 'datasets/' path of the 'normal' cell - confirm it still points at the data.
    path = '/content/drive/MyDrive/NTU_graph_classification/datasets/' + dataset + '/default/sequences-default/with_timestamp/'

    # One JSON file per class: label_1 -> "positive", label_0 -> "negative".
    with open(path + 'label_1/with_timestamp_sequence_label_1.txt', 'r') as f:
        reviews_positive = json.load(f)

    with open(path + 'label_0/with_timestamp_sequence_label_0.txt', 'r') as f:
        reviews_negative = json.load(f)

    # Vocabulary size per dataset; datasets not listed fall back to 100000.
    TOP_WORDS = {
        'MUTAG': 300,
        'PTC_FR': 1018,
        'NCI-H23': 4913,
        'IMDB-BINARY': 23415,
        'DBLP_v1': 57552,
    }.get(dataset, 100000)

    print(dataset)
    print("TOP WORDS: ", TOP_WORDS)

    # Pair every sequence with its label (1.0 / 0.0), then shuffle with a fixed seed.
    Reviews_Labeled = (
        list(zip(reviews_positive, np.ones(len(reviews_positive))))
        + list(zip(reviews_negative, np.zeros(len(reviews_negative))))
    )
    random.seed(7)
    random.shuffle(Reviews_Labeled)
    print(Reviews_Labeled)

    # Build the vocabulary; its word counts are written to <path>analysis.vocab.
    vocab = Vocabulary(TOP_WORDS, path + "analysis.vocab")
    reviews_text = [pair[0] for pair in Reviews_Labeled]
    vocab.PrepareVocabulary(reviews_text)

    with open(path + "analysis.vocab") as f:
        data = json.load(f)
        print('Total number of words: ' + str(len(data)))

    # Encode the sequences as index vectors and pad them to the longest one.
    reviews, labels = zip(*Reviews_Labeled)
    reviews_int = vocab.TransformSentencesToId(reviews)

    X = np.array(reviews_int, dtype=object)
    max_len = max(len(encoded) for encoded in X)
    print("Maximum length :", max_len)
    max_review_length = max_len
    X = sequence.pad_sequences(X, maxlen=max_review_length)
    Y = np.array(labels)

    # 5-fold stratified splitter reused by the cells below.
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)


In [None]:
if time_type == 'w':
    # Collect the train/test row indices of every fold and store them as JSON.
    fold_index = {"train": [], "test": []}
    for train_index, test_index in kfold.split(X, Y):
        fold_index["train"].append(train_index.tolist())
        fold_index["test"].append(test_index.tolist())
        # Leaves the arrays of the last fold in the notebook namespace.
        X_train, y_train = X[train_index], Y[train_index]
        X_test, y_test = X[test_index], Y[test_index]

    print(fold_index['test'])
    with open('results/' + dataset + '/' + edge_type + '/w/indexes.txt', 'w') as f:
        json.dump(fold_index, f)

#### without timestamp

In [None]:
if time_type == 'wo' and edge_type == 'normal':
    # Sequences without timestamps for the original edge labels.
    path = 'datasets/' + dataset + '/sequences/without_timestamp/'

    # One JSON file per class: label_1 -> "positive", label_0 -> "negative".
    with open(path + 'label_1/without_timestamp_sequence_label_1.txt', 'r') as f:
        reviews_positive = json.load(f)
    with open(path + 'label_0/without_timestamp_sequence_label_0.txt', 'r') as f:
        reviews_negative = json.load(f)

    # Vocabulary size per dataset; datasets not listed fall back to 2000.
    TOP_WORDS = {
        'MUTAG': 23,
        'PTC_FR': 70,
        'NCI-H23': 80,
        'TOX21_AR': 178,
        'DBLP_v1': 47295,
    }.get(dataset, 2000)

    print(dataset)
    print("TOP WORDS: ", TOP_WORDS)

    # Pair every sequence with its label (1.0 / 0.0), then shuffle with a fixed seed.
    Reviews_Labeled = (
        list(zip(reviews_positive, np.ones(len(reviews_positive))))
        + list(zip(reviews_negative, np.zeros(len(reviews_negative))))
    )
    random.seed(7)
    random.shuffle(Reviews_Labeled)
    print(Reviews_Labeled)

    # Build the vocabulary; its word counts are written to <path>analysis.vocab.
    vocab = Vocabulary(TOP_WORDS, path + "analysis.vocab")
    reviews_text = [pair[0] for pair in Reviews_Labeled]
    vocab.PrepareVocabulary(reviews_text)

    with open(path + "analysis.vocab") as f:
        data = json.load(f)
        print('Total number of words: ' + str(len(data)))

    # Encode the sequences as index vectors and pad them to the longest one.
    reviews, labels = zip(*Reviews_Labeled)
    reviews_int = vocab.TransformSentencesToId(reviews)

    X = np.array(reviews_int, dtype=object)
    max_len = max(len(encoded) for encoded in X)
    print("Maximum length :", max_len)
    max_review_length = max_len
    X = sequence.pad_sequences(X, maxlen=max_review_length)
    Y = np.array(labels)

    # 5-fold stratified splitter reused by the cells below.
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)


In [None]:
if time_type == 'wo' and edge_type == 'default':
    # Sequences without timestamps for the default edge labels.
    # NOTE(review): this is an absolute Drive path under NTU_graph_classification, unlike the
    # relative 'datasets/' path of the 'normal' cell - confirm it still points at the data.
    path = '/content/drive/MyDrive/NTU_graph_classification/datasets/' + dataset + '/default/sequences-default/without_timestamp/'

    # One JSON file per class: label_1 -> "positive", label_0 -> "negative".
    with open(path + 'label_1/without_timestamp_sequence_label_1.txt', 'r') as f:
        reviews_positive = json.load(f)
    with open(path + 'label_0/without_timestamp_sequence_label_0.txt', 'r') as f:
        reviews_negative = json.load(f)

    # Vocabulary size per dataset; datasets not listed fall back to 2000.
    TOP_WORDS = {
        'MUTAG': 12,
        'PTC_FR': 41,
        'NCI-H23': 54,
        'IMDB-BINARY': 1615,
        'DBLP_v1': 47217,
    }.get(dataset, 2000)

    print(dataset)
    print("TOP WORDS: ", TOP_WORDS)

    # Pair every sequence with its label (1.0 / 0.0), then shuffle with a fixed seed.
    Reviews_Labeled = (
        list(zip(reviews_positive, np.ones(len(reviews_positive))))
        + list(zip(reviews_negative, np.zeros(len(reviews_negative))))
    )
    random.seed(7)
    random.shuffle(Reviews_Labeled)
    print(Reviews_Labeled)

    # Build the vocabulary; its word counts are written to <path>analysis.vocab.
    vocab = Vocabulary(TOP_WORDS, path + "analysis.vocab")
    reviews_text = [pair[0] for pair in Reviews_Labeled]
    vocab.PrepareVocabulary(reviews_text)

    with open(path + "analysis.vocab") as f:
        data = json.load(f)
        print('Total number of words: ' + str(len(data)))

    # Encode the sequences as index vectors and pad them to the longest one.
    reviews, labels = zip(*Reviews_Labeled)
    reviews_int = vocab.TransformSentencesToId(reviews)

    X = np.array(reviews_int, dtype=object)
    max_len = max(len(encoded) for encoded in X)
    print("Maximum length :", max_len)
    max_review_length = max_len
    X = sequence.pad_sequences(X, maxlen=max_review_length)
    Y = np.array(labels)

    # 5-fold stratified splitter reused by the cells below.
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)


In [None]:
if time_type == 'wo':
    # Collect the train/test row indices of every fold and store them as JSON.
    fold_index = {"train": [], "test": []}
    for train_index, test_index in kfold.split(X, Y):
        fold_index["train"].append(train_index.tolist())
        fold_index["test"].append(test_index.tolist())
        # Leaves the arrays of the last fold in the notebook namespace.
        X_train, y_train = X[train_index], Y[train_index]
        X_test, y_test = X[test_index], Y[test_index]

    print(fold_index['test'])
    with open('results/' + dataset + '/' + edge_type + '/wo/indexes.txt', 'w') as f:
        json.dump(fold_index, f)

### CallBack function


In [None]:

# define your custom callback for prediction
class PredictionCallback(tf.keras.callbacks.Callback):
    """Keras callback that appends per-epoch metrics to a CSV file.

    After every epoch the model being trained is evaluated on the given
    validation data and one row is written with: ROC AUC, PRC AUC, training
    loss, training accuracy (%), validation loss, validation accuracy (%) and
    the epoch duration in seconds.

    Args:
        valx: ``(inputs, labels)`` pair used for validation.
        file: Path of the CSV file; it is (re)created when training starts.
    """

    def __init__(self, valx, file):
        super().__init__()
        self.validation_data = valx
        self.filename = file

    def on_train_begin(self, logs=None):
        """Create the CSV file and write the header row."""
        # Uses the path given to this callback, not a notebook-level `filename`.
        with open(self.filename, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(["roc", "prc", "train loss", "train acc", "val loss", "val acc", "epoch time"])

    def on_epoch_begin(self, epoch, logs=None):
        """Remember when the epoch started."""
        self.epoch_time_start = time.time()

    def on_epoch_end(self, epoch, logs=None):
        """Compute the validation metrics and append them as one CSV row."""
        logs = logs or {}

        # Epoch duration is taken before the extra predict/evaluate passes below.
        self.epoch_time_end = time.time() - self.epoch_time_start

        val_inputs, val_labels = self.validation_data[0], self.validation_data[1]

        # Column 0 of the prediction matrix is used as the score of each sample.
        yhat_probs = self.model.predict(val_inputs)
        yhat_probs = yhat_probs[:, 0]

        roc_auc = metrics.roc_auc_score(val_labels, yhat_probs, average='macro')
        print('ROC AUC: %f' % roc_auc)

        prc_auc = metrics.average_precision_score(
            val_labels, yhat_probs, average='macro', pos_label=1)
        print('PRC AUC: %f' % prc_auc)

        # Evaluate the model attached to this callback, not a notebook-level `model`.
        scores = self.model.evaluate(val_inputs, val_labels, verbose=0)
        print("Accuracy: %.2f%%" % (scores[1]*100))

        # newline='' matches the header write; the `with` block closes the file.
        with open(self.filename, 'a', newline='') as f_object:
            writer_object = csv.writer(f_object)
            writer_object.writerow([
                "%.4f" % roc_auc,
                "%.4f" % prc_auc,
                "%.4f" % logs['loss'],
                "%.4f" % (logs['accuracy']*100),
                "%.4f" % scores[0],
                "%.4f" % (scores[1]*100),
                "%.5f" % self.epoch_time_end,
            ])


### Function to save averages and its standard deviation


In [None]:
def save_avg(input_path):
    """Average the per-fold metric CSVs of a folder and save mean and std.

    Every ``*.csv`` file in ``input_path`` except ``average.csv`` and
    ``average_with_std.csv`` is read. The files are averaged element-wise across
    files, and ``average_with_std.csv`` is written to the same folder with the
    columns: roc, roc std, prc, prc std, train loss, train acc, val loss,
    val acc, val acc std, epoch time.

    Args:
        input_path: Folder holding the per-fold CSV files (with or without a
            trailing path separator).

    Raises:
        FileNotFoundError: If the folder contains no per-fold CSV file.
    """
    directory = input_path
    excluded = ("average.csv", "average_with_std.csv")

    # Sorted so the files are processed in the same order on every run.
    fold_files = sorted(
        name for name in os.listdir(directory)
        if name.endswith(".csv") and name not in excluded
    )
    if not fold_files:
        raise FileNotFoundError(
            "no per-fold .csv files found in " + str(directory))

    avg_in = []
    for filename in fold_files:
        print("file: ", filename)
        df = pd.read_csv(os.path.join(directory, filename))
        avg_in.append(df.to_numpy().tolist())

    # Mean / std over the file axis: the results have shape (rows, columns).
    avg = np.mean(avg_in, axis=0)
    avg_std = np.std(avg_in, axis=0)

    save_file = os.path.join(directory, "average_with_std.csv")
    header = ["roc", "prc", "train loss", "train acc", "val loss", "val acc", "epoch time"]
    header_std = ["roc std", "prc std", "val acc std"]
    # Column order matters: top_values() reads this file by column position.
    data = pd.DataFrame({
        header[0]: avg[:, 0],
        header_std[0]: avg_std[:, 0],
        header[1]: avg[:, 1],
        header_std[1]: avg_std[:, 1],
        header[2]: avg[:, 2],
        header[3]: avg[:, 3],
        header[4]: avg[:, 4],
        header[5]: avg[:, 5],
        header_std[2]: avg_std[:, 5],
        header[6]: avg[:, 6],
    })
    data.to_csv(save_file, index=False)


### Function to store top values 

In [None]:
def _best_epoch(data, column):
    """Return ``(value, row_index)`` of the first largest entry of ``column``.

    Starts from value 0 at row 0, so rows only win with a value above 0.
    """
    best_value, best_index = 0, 0
    for i in range(len(data)):
        if data[i][column] > best_value:
            best_value, best_index = data[i][column], i
    return best_value, best_index


def top_values(directory, lr, filename_save, filename_save_approx, batch_size=64):
    """Append the best epochs of one learning rate to the summary CSV files.

    Reads ``<directory>/lr_<lr>/average_with_std.csv`` and finds the epochs with
    the highest ROC AUC, PRC AUC and validation accuracy. Three rows (one per
    criterion) plus an empty separator row are appended to ``filename_save``
    (full precision) and to ``filename_save_approx`` (values rounded to three
    decimals).

    Args:
        directory: Folder containing the ``lr_<lr>`` sub-folders.
        lr: Learning rate whose averaged results are summarised.
        filename_save: CSV file receiving the full-precision rows.
        filename_save_approx: CSV file receiving the rounded rows.
        batch_size: Value written to the "batch size" column (default 64, the
            value that used to be hard-coded).
    """
    filename = os.path.join(directory, "lr_" + str(lr), "average_with_std.csv")
    data = pd.read_csv(filename).to_numpy()

    # Column positions in average_with_std.csv: 0 = roc, 2 = prc, 7 = val acc.
    max_roc, max_roc_i = _best_epoch(data, 0)
    max_prc, max_prc_i = _best_epoch(data, 2)
    max_acc, max_acc_i = _best_epoch(data, 7)

    print(max_roc)
    print(max_prc)
    print(max_acc)

    # Columns copied to the summary: roc, roc std, prc, prc std, val acc, val acc std.
    metric_columns = (0, 1, 2, 3, 7, 8)
    best_rows = (max_roc_i, max_prc_i, max_acc_i)
    separator = [""] * 10

    # newline='' is what the csv module expects and matches how the header is written.
    # NOTE(review): the last column ("epoch time") is written as the constant 0,
    # exactly as before - confirm whether the averaged epoch time was intended.
    with open(filename_save, 'a', newline='') as f_object:
        writer_object = csv.writer(f_object)
        for i in best_rows:
            writer_object.writerow(
                [lr, i + 1] + [data[i][c] for c in metric_columns] + [batch_size, 0])
        writer_object.writerow(separator)

    with open(filename_save_approx, 'a', newline='') as f_object:
        writer_object = csv.writer(f_object)
        for i in best_rows:
            writer_object.writerow(
                [lr, i + 1] + ["%.3f" % data[i][c] for c in metric_columns] + [batch_size, 0])
        writer_object.writerow(separator)

### Code for LSTM

#### Make directory

In [None]:
# Create the LSTM results folder; exist_ok makes re-running this cell harmless.
os.makedirs(os.path.join('results', dataset, edge_type, time_type, 'lstm'), exist_ok=True)

In [None]:
# Number of class-1 ("positive") and class-0 ("negative") sequences that were loaded.
print('positive',len(reviews_positive))
print('negative',len(reviews_negative))

#### Hyperparam setting

In [None]:
embedding_vector_length = 32  # output dimension of the Embedding layer
epoch = 350  # number of epochs passed to model.fit for every fold
batch_size = 64  # batch size passed to model.fit
lr_arr = [0.01, 0.001, 0.0001]  # Adam learning rates; each one gets its own 5-fold run

#### Setting to save top values

In [None]:
# Folder and summary files for the LSTM runs.
directory = 'results/' + dataset + '/' + edge_type + '/' + time_type + '/lstm/'

filename_save = directory + "top_code.csv"
filename_save_approx = directory + "top_code_rounded.csv"

# (Re)create both summary files with the same header row.
for _summary_file in (filename_save, filename_save_approx):
    with open(_summary_file, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["lr", "epoch", "auc roc", "auc roc std", "auc prc", "auc prc std", "val acc", "val acc std", "batch size", "epoch time"])

#### Model

In [None]:
# Train and evaluate an LSTM classifier with 5-fold cross-validation for every learning rate.
for lr in lr_arr:
    path = 'results/'+dataset+'/'+edge_type+'/'+time_type+'/lstm/lr_'+str(lr)+'/'
    # Idempotent replacement for `!mkdir`: also creates missing parents, silent on re-runs.
    os.makedirs(path, exist_ok=True)

    # Re-seed before every learning rate so the runs start from the same random state.
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    print("\nEpoch no: ", epoch, "\tLearning rate: ", lr)
    fold = 0
    for train, test in kfold.split(X, Y):
        fold = fold + 1
        # Per-fold metrics file written by PredictionCallback.
        filename = path+dataset+'_'+str(fold)+'.csv'

        print('\nFold no: ', fold)
        X_train = X[train]
        y_train = Y[train]
        X_test = X[test]
        y_test = Y[test]
        print(y_train)

        model = Sequential()
        model.add(Embedding(TOP_WORDS, embedding_vector_length,
                            input_length=max_review_length))
        model.add(LSTM(50))
        model.add(Dense(1, activation='sigmoid'))
        opt = tf.keras.optimizers.Adam(learning_rate=lr)
        model.compile(loss='binary_crossentropy',
                        optimizer=opt, metrics=['accuracy'])
        # summary() prints by itself; wrapping it in print() only added a stray "None".
        model.summary()

        model.fit(X_train, y_train,
            validation_data=(X_test, y_test), epochs=epoch, batch_size=batch_size, callbacks=[PredictionCallback((X_test, y_test), filename)])

    params = {"kfold":fold,"embedding_size": embedding_vector_length, "vocab_size": len(data), "max_len": max_len, "TOP_WORDS": TOP_WORDS, "Epoch": epoch, "lr": lr, "batch_size": batch_size, "Positive": len(reviews_positive), "Negative": len(reviews_negative)}
    print(params)

    # Store the settings of this run next to its per-fold results.
    with open(path + 'params.txt', 'w') as f:
        f.write(json.dumps(params, indent=2))

    #save averages
    save_avg(path)

    #save top values
    top_values(directory, lr, filename_save, filename_save_approx)

print('kfold:', fold)
print(dataset)
print("Top words: ",TOP_WORDS)
print("Embedding size: ", embedding_vector_length)
# Label follows the selected sequence variant instead of a hard-coded "with_timestamp".
print('lstm_' + ('with' if time_type == 'w' else 'without') + '_timestamp')
print('positive',len(reviews_positive))
print('negative',len(reviews_negative))

### Code for GRU

#### Make directory

In [None]:
# Create the GRU results folder; exist_ok makes re-running this cell harmless.
os.makedirs(os.path.join('results', dataset, edge_type, time_type, 'gru'), exist_ok=True)

#### Hyperparam setting

In [None]:
embedding_vector_length = 32  # output dimension of the Embedding layer
epoch = 350  # number of epochs passed to model.fit for every fold
batch_size = 64  # batch size passed to model.fit
lr_arr = [0.01, 0.001, 0.0001]  # Adam learning rates; each one gets its own 5-fold run

#### Setting to save top values

In [None]:
# Folder and summary files for the GRU runs.
directory = 'results/' + dataset + '/' + edge_type + '/' + time_type + '/gru/'

filename_save = directory + "top_code.csv"
filename_save_approx = directory + "top_code_rounded.csv"

# (Re)create both summary files with the same header row.
for _summary_file in (filename_save, filename_save_approx):
    with open(_summary_file, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["lr", "epoch", "auc roc", "auc roc std", "auc prc", "auc prc std", "val acc", "val acc std", "batch size", "epoch time"])

#### Model

In [None]:
# Train and evaluate a GRU classifier with 5-fold cross-validation for every learning rate.
for lr in lr_arr:
    path = 'results/'+dataset+'/'+edge_type+'/'+time_type+'/gru/lr_'+str(lr)+'/'
    # Idempotent replacement for `!mkdir`: also creates missing parents, silent on re-runs.
    os.makedirs(path, exist_ok=True)

    # Re-seed before every learning rate; uses the shared `seed` like the other model cells.
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    print("\nEpoch no: ", epoch, "\tLearning rate: ", lr)
    fold = 0
    for train, test in kfold.split(X, Y):
        fold = fold + 1
        # Per-fold metrics file written by PredictionCallback.
        filename = path+dataset+'_'+str(fold)+'.csv'

        print('\nFold no: ', fold)
        X_train = X[train]
        y_train = Y[train]
        X_test = X[test]
        y_test = Y[test]
        print(len(X_test))

        model = Sequential()
        model.add(Embedding(TOP_WORDS, embedding_vector_length,
                            input_length=max_review_length))
        model.add(GRU(50))
        model.add(Dense(1, activation='sigmoid'))
        opt = tf.keras.optimizers.Adam(learning_rate=lr)
        model.compile(loss='binary_crossentropy',
                        optimizer=opt, metrics=['accuracy'])
        valx = (X_test, y_test)

        model.fit(X_train, y_train,
            validation_data=(X_test, y_test), epochs=epoch, batch_size=batch_size, callbacks=[PredictionCallback(valx, filename)])

    params = {"kfold":fold,"embedding_size": embedding_vector_length, "vocab_size": len(data), "max_len": max_len, "TOP_WORDS": TOP_WORDS, "Epoch": epoch, "lr": lr, "batch_size": batch_size, "Positive": len(reviews_positive), "Negative": len(reviews_negative)}
    print(params)

    # Store the settings of this run next to its per-fold results.
    with open(path + 'params.txt', 'w') as f:
        f.write(json.dumps(params, indent=2))

    #save averages
    save_avg(path)

    #save top values
    top_values(directory, lr, filename_save, filename_save_approx)

print('kfold:', fold)
print(dataset)
print("Top words: ",TOP_WORDS)
print("Embedding size: ", embedding_vector_length)
# Label follows the selected sequence variant instead of a hard-coded "with_timestamp".
print('gru_' + ('with' if time_type == 'w' else 'without') + '_timestamp')
print('positive',len(reviews_positive))
print('negative',len(reviews_negative))

### Code for BiLSTM

#### Make directory

In [None]:
# Create the BiLSTM results folder; exist_ok makes re-running this cell harmless.
os.makedirs(os.path.join('results', dataset, edge_type, time_type, 'bilstm'), exist_ok=True)

#### Hyperparam setting

In [None]:
embedding_vector_length = 32  # output dimension of the Embedding layer
epoch = 350  # number of epochs passed to model.fit for every fold
batch_size = 64  # batch size passed to model.fit
lr_arr = [0.01, 0.001, 0.0001]  # Adam learning rates; each one gets its own 5-fold run

#### Setting to save top values

In [None]:
# Folder and summary files for the BiLSTM runs.
directory = 'results/' + dataset + '/' + edge_type + '/' + time_type + '/bilstm/'

filename_save = directory + "top_code.csv"
filename_save_approx = directory + "top_code_rounded.csv"

# (Re)create both summary files with the same header row.
for _summary_file in (filename_save, filename_save_approx):
    with open(_summary_file, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["lr", "epoch", "auc roc", "auc roc std", "auc prc", "auc prc std", "val acc", "val acc std", "batch size", "epoch time"])

In [None]:
# Number of class-1 ("positive") and class-0 ("negative") sequences that were loaded.
print('positive',len(reviews_positive))
print('negative',len(reviews_negative))

#### Model

In [None]:
# Train and evaluate a bidirectional LSTM classifier with 5-fold cross-validation
# for every learning rate.
for lr in lr_arr:
    path = 'results/'+dataset+'/'+edge_type+'/'+time_type+'/bilstm/lr_'+str(lr)+'/'
    # Idempotent replacement for `!mkdir`: also creates missing parents, silent on re-runs.
    os.makedirs(path, exist_ok=True)

    # Re-seed before every learning rate so the runs start from the same random state.
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    print("\nEpoch no: ", epoch, "\tLearning rate: ", lr)
    fold = 0
    for train, test in kfold.split(X, Y):
        fold = fold+1
        # Per-fold metrics file written by PredictionCallback.
        filename = path+dataset+'_'+str(fold)+'.csv'

        print('\nFold no: ', fold)
        X_train = X[train]
        y_train = Y[train]
        X_test = X[test]
        y_test = Y[test]

        model = Sequential()
        model.add(Embedding(TOP_WORDS, embedding_vector_length,
                            input_length=max_review_length))
        model.add(Bidirectional(LSTM(50)))
        model.add(Dense(1, activation='sigmoid'))
        opt = tf.keras.optimizers.Adam(learning_rate=lr)
        model.compile(loss='binary_crossentropy',
                        optimizer=opt, metrics=['accuracy'])
        # summary() prints by itself; wrapping it in print() only added a stray "None".
        model.summary()

        model.fit(X_train, y_train,
            validation_data=(X_test, y_test), epochs=epoch, batch_size=batch_size, callbacks=[PredictionCallback((X_test, y_test), filename)])

    params = {"kfold":fold,"embedding_size": embedding_vector_length, "vocab_size": len(data), "max_len": max_len, "TOP_WORDS": TOP_WORDS, "Epoch": epoch, "lr": lr, "batch_size": batch_size, "Positive": len(reviews_positive), "Negative": len(reviews_negative)}
    print(params)

    # Reuse `path`: the previous literal started with 'esults/' and made this
    # write fail only after all folds had finished training.
    with open(path + 'params.txt', 'w') as f:
        f.write(json.dumps(params, indent=2))

    #save averages
    save_avg(path)

    #save top values
    top_values(directory, lr, filename_save, filename_save_approx)

print('kfold:', fold)
print(dataset)
print("Top words: ",TOP_WORDS)
print("Embedding size: ", embedding_vector_length)
# Label follows the selected sequence variant instead of a hard-coded "without_timestamp".
print('bilstm_' + ('with' if time_type == 'w' else 'without') + '_timestamp')
print('positive',len(reviews_positive))
print('negative',len(reviews_negative))

### Code for Transformer

#### Make directory

In [None]:
# Create the Transformer results folder; exist_ok makes re-running this cell harmless.
os.makedirs(os.path.join('results', dataset, edge_type, time_type, 'transformer'), exist_ok=True)

#### Hyperparam setting

In [None]:
epoch = 350  # number of epochs passed to model.fit for every fold
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer
batch_size = 64  # batch size passed to model.fit
lr_arr = [0.01, 0.001, 0.0001]  # Adam learning rates; each one gets its own 5-fold run

#### Setting to save top values

In [None]:
# Folder and summary files for the Transformer runs.
directory = 'results/' + dataset + '/' + edge_type + '/' + time_type + '/transformer/'

filename_save = directory + "top_code.csv"
filename_save_approx = directory + "top_code_rounded.csv"

# (Re)create both summary files with the same header row.
for _summary_file in (filename_save, filename_save_approx):
    with open(_summary_file, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["lr", "epoch", "auc roc", "auc roc std", "auc prc", "auc prc std", "val acc", "val acc std", "batch size", "epoch time"])

#### Model

In [None]:
class TransformerBlock(layers.Layer):
    """Self-attention block followed by a two-layer feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual addition and
    layer normalisation.

    Args:
        embed_dim: Size of the token embeddings; used as the attention
            ``key_dim`` and as the output size of the feed-forward network.
        num_heads: Number of attention heads.
        ff_dim: Hidden size of the feed-forward network.
        rate: Dropout rate applied after both sub-layers.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        ``training`` is forwarded to the dropout layers. It defaults to None so
        the method can also be called without the flag.
        """
        # Self-attention: the inputs serve as both query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)


In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a token embedding and a position embedding.

    Args:
        maxlen: Number of positions in the position embedding table.
        vocab_size: Number of entries in the token embedding table.
        embed_dim: Output size of both embeddings.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Return token embeddings of ``x`` plus the embeddings of positions 0..len-1."""
        # The sequence length is read from the last axis of the input.
        seq_len = tf.shape(x)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        # Position embedding is applied before the token embedding, as before.
        position_vectors = self.pos_emb(position_ids)
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors

In [None]:
# Train and evaluate the Transformer classifier with 5-fold cross-validation
# for every learning rate.
for lr in lr_arr:
    path = 'results/'+dataset+'/'+edge_type+'/'+time_type+'/transformer/lr_'+str(lr)+'/'
    # Idempotent replacement for `!mkdir`: also creates missing parents, silent on re-runs.
    os.makedirs(path, exist_ok=True)

    # Re-seed before every learning rate so the runs start from the same random state.
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    print("\nEpoch no: ", epoch, "\tLearning rate: ", lr)
    fold = 0
    for train, test in kfold.split(X, Y):
        fold = fold+1
        # Per-fold metrics file written by PredictionCallback.
        filename = path+dataset+'_'+str(fold)+'.csv'
        print('\nFold no: ', fold)
        X_train = X[train]
        y_train = Y[train]
        X_test = X[test]
        y_test = Y[test]

        # Embedding -> transformer block -> average pooling -> small dense head.
        inputs = layers.Input(shape=(max_len,))
        embedding_layer = TokenAndPositionEmbedding(max_len, TOP_WORDS, embed_dim)
        x = embedding_layer(inputs)
        transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
        x = transformer_block(x)
        x = layers.GlobalAveragePooling1D()(x)
        x = layers.Dropout(0.1)(x)
        x = layers.Dense(20, activation="relu")(x)
        x = layers.Dropout(0.1)(x)
        outputs = layers.Dense(1, activation="sigmoid")(x)

        model = tf.keras.Model(inputs=inputs, outputs=outputs)
        opt = tf.keras.optimizers.Adam(learning_rate=lr)
        model.compile(loss='binary_crossentropy',
                        optimizer=opt, metrics=['accuracy'])
        valx = (X_test, y_test)

        model.fit(X_train, y_train,
            validation_data=(X_test, y_test), epochs=epoch, batch_size=batch_size, callbacks=[PredictionCallback(valx, filename)])

    # "lr" records the learning rate of this run (it used to store the whole lr_arr list).
    params = {"kfold":fold, "embedding_size": embed_dim, "num_heads": num_heads, "ff_dim": ff_dim, "vocab_size": len(data), "max_len": max_len, "TOP_WORDS": TOP_WORDS, "Epoch": epoch, "lr": lr, "batch_size": batch_size, "Positive": len(reviews_positive), "Negative": len(reviews_negative)}
    print(params)

    # Store the settings of this run next to its per-fold results.
    with open(path + 'params.txt', 'w') as f:
        f.write(json.dumps(params, indent=2))

    #save averages
    save_avg(path)

    #save top values
    top_values(directory, lr, filename_save, filename_save_approx)

print('kfold:', fold)
print(dataset)
print("Top words: ",TOP_WORDS)
print("Embedding size: ", embed_dim)
# Label follows the selected sequence variant instead of a hard-coded "without_timestamp".
print('transformer_' + ('with' if time_type == 'w' else 'without') + '_timestamp')
print('positive',len(reviews_positive))
print('negative',len(reviews_negative))