# Installation and imports

In [None]:
!pip install -q -r requirements.txt

In [None]:
!pip install -q \
    --extra-index-url=https://pypi.nvidia.com \
    cudf-cu12==23.12.* dask-cudf-cu12==23.12.* cuml-cu12==23.12.* \
    cugraph-cu12==23.12.* cuspatial-cu12==23.12.* cuproj-cu12==23.12.* \
    cuxfilter-cu12==23.12.* cucim-cu12==23.12.* pylibraft-cu12==23.12.* \
    raft-dask-cu12==23.12.*

In [None]:
# GPU-accelerated models (for ensemble)
from cuml import svm
from cuml import LogisticRegression
from cuml.common import logger

# Set cuML's logger to its warning level.
logger.set_level(logger.level_warn)

In [None]:
import datasets
import experiments
import features
import utilities
import gat
import gcn
import node2vec
import ensemble
import torch
# NOTE: later cells rebind the global name `model` to a GAT instance, which hides
# this module under that name for the rest of the notebook.
import model
import pickle as pk
import os
from google.colab import drive
import time
# Mount Google Drive at /content/drive; later cells save copies of the results
# under "/content/drive/My Drive/".
drive.mount('/content/drive')

# Experiment 1: basic GAT + combinations of structural and positional features


In [None]:
# Load the 'ogbn-arxiv' dataset; load_data returns a pair that is unpacked into G and data.
G, data = datasets.load_data('ogbn-arxiv')
print(data)

In [None]:
# Global model variables

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE: this rebinds `model` (imported above as a module) to a GATBase instance.
# NOTE(review): run_feature_combinations builds its own model and optimizer for every
# feature combination, so these two globals look unused by it - confirm before removing.
model = gat.GATBase(data, 8, 8)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()
n_epochs = 200
n_runs = 10
# Positional arguments forwarded to the model factory after `data`
# (hidden_channels and heads for gat_base_factory).
global_model_params = [8, 8]

In [None]:
# Global features

# Node features as loaded, placed on `device`.
original_features = data.x.to(device)
# Structural features computed from G for the listed measure codes, placed on `device`.
structural_features = features.structural_features(G, ['cc', 'bc', 'dc', 'ec', 'pr', 'cn', 'lc', 'nd', 'kc']).to(device)
# NOTE(review): unlike the two tensors above, this one is not moved to `device` here -
# confirm that features.positional_features already returns it on the right device.
positional_features = features.positional_features(data)

In [None]:
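# Experiment helper functions. Keep in mind: they read the notebook-level globals (data, original_features, structural_features, positional_features, n_runs, n_epochs, criterion, device)!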

def gat_base_factory(data, hidden_channels, heads):
    """Build a ``gat.GATBase`` from ``data``, ``hidden_channels`` and ``heads``."""
    base_gat = gat.GATBase(data, hidden_channels, heads)
    return base_gat

def gat_pre_factory(data, hidden_channels, heads, mlp_hidden_channels):
    """Build a ``gat.GATPre`` from ``data`` and the three given hyper-parameters."""
    pre_gat = gat.GATPre(data, hidden_channels, heads, mlp_hidden_channels)
    return pre_gat

def gcn_base_factory(data, hidden_channels):
    """Build a ``gcn.GCNBase`` from ``data`` and ``hidden_channels``."""
    base_gcn = gcn.GCNBase(data, hidden_channels)
    return base_gcn

def gcn_pre_factory(data, hidden_channels, mlp_hidden_channels):
    """Build a ``gcn.GCNPre`` from ``data``, ``hidden_channels`` and ``mlp_hidden_channels``."""
    pre_gcn = gcn.GCNPre(data, hidden_channels, mlp_hidden_channels)
    return pre_gcn

def run_feature_combinations(file_name, model_factory, global_model_params, normalization=lambda x: x):
    """Train one model per feature combination and save all results under ``file_name``.

    The seven combinations are built from the notebook-level tensors
    ``original_features``, ``structural_features`` and ``positional_features``
    (each alone, every pair, and all three). For each combination the global
    ``data.x`` is replaced, a fresh model and Adam optimizer are created, and
    ``experiments.run_experiments`` is called with the global ``n_runs``,
    ``n_epochs``, ``criterion`` and ``device``.

    Args:
        file_name: Name handed to ``utilities.save_results`` for the result dict.
        model_factory: Callable invoked as ``model_factory(data, *global_model_params)``.
        global_model_params: Positional hyper-parameters forwarded to ``model_factory``.
        normalization: Callable applied to the feature tensor before training;
            the default is the identity.

    Side effects:
        ``data.x`` is overwritten while the experiments run and is restored to its
        previous value afterwards (also when an experiment raises). The collected
        results are written through ``utilities.save_results``.
    """
    features_combinations = [
        original_features,
        structural_features,
        positional_features,
        utilities.concatenate(original_features, structural_features),
        utilities.concatenate(original_features, positional_features),
        utilities.concatenate(structural_features, positional_features),
        utilities.concatenate(original_features, structural_features, positional_features),
    ]

    file_names = [
        'original',
        'structural',
        'positional',
        'original-structural',
        'original-positional',
        'structural-positional',
        'original-structural-positional',
    ]

    # Width of the original feature block. Every combination whose name starts with
    # 'original' has these columns first, so they can be split off again below.
    # (The name was previously undefined, which made the Cora branch raise NameError.)
    orig_num_feat = original_features.size()[1]

    # getattr: a data object without a `name` attribute is simply treated as non-Cora.
    is_cora = getattr(data, 'name', None) == 'Cora'

    # Remember the caller-visible features so the shared `data` object is not left
    # holding the last combination once this function returns.
    initial_x = data.x

    basic_models = dict()
    try:
        for curr_features, curr_file_name in zip(features_combinations, file_names):
            data.x = normalization(curr_features)

            if is_cora and curr_file_name.startswith('original'):
                # For Cora the original columns are kept as they are; only the
                # appended structural/positional columns are normalized.
                orig_feats, other_feats = curr_features.split(
                    [orig_num_feat, curr_features.size()[1] - orig_num_feat], dim=-1)
                data.x = utilities.concatenate(orig_feats, normalization(other_feats))

            # Local names chosen so they do not shadow the `model` module / globals.
            curr_model = model_factory(data, *global_model_params)
            curr_optimizer = torch.optim.Adam(curr_model.parameters(), lr=0.005, weight_decay=5e-4)

            # n_runs, n_epochs, criterion and device are notebook-level globals.
            (avg_acc, test_accs, train_losses, train_accs,
             val_losses, val_accs, run_times, best_epoch) = experiments.run_experiments(
                curr_model, data, n_runs, n_epochs, curr_optimizer, criterion, device)

            basic_models[curr_file_name] = {
                'avg_acc': avg_acc,
                'test_accs': test_accs,
                'train_losses': train_losses,
                'train_accs': train_accs,
                'val_losses': val_losses,
                'val_accs': val_accs,
                'run_times': run_times,
                'best_epoch': best_epoch,
                'model': curr_model,
            }
    finally:
        data.x = initial_x

    utilities.save_results(basic_models, file_name)

In [None]:
# Run the seven feature combinations with GATBase and the default (identity) normalization.
run_feature_combinations('gat-base-concatenation-without-norm', gat_base_factory, global_model_params)

In [None]:
# Load the results back using the same name the previous cell saved them under.
gat_base_concatenation_without_norm = utilities.load_results('gat-base-concatenation-without-norm')

In [None]:
# Save a copy of the loaded results under the mounted Google Drive folder.
# NOTE(review): this path carries an explicit '.pkl' suffix while the name used above
# does not - confirm how utilities.save_results treats extensions.
full_path = os.path.join("/content/drive/My Drive/", 'gat-base-concatenation-without-norm.pkl')
utilities.save_results(gat_base_concatenation_without_norm, full_path)

In [None]:
print(gat_base_concatenation_without_norm)

### Adding Min-Max Normalization

In [None]:
# Global model variables
# Same settings as the first experiment, declared again for this section.
# NOTE(review): `model` and `optimizer` are not passed to run_feature_combinations,
# which builds its own per feature combination - they look unused here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = gat.GATBase(data, 8, 8)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()
n_epochs = 200
n_runs = 10
# hidden_channels and heads for gat_base_factory.
global_model_params = [8, 8]

In [None]:
# Repeat the GATBase feature-combination runs with utilities.MinMaxNormalization as the normalization.
run_feature_combinations('gat-base-concatenation-minmax-norm', gat_base_factory, global_model_params, normalization=utilities.MinMaxNormalization)

In [None]:
# Load the results back using the same name the previous cell saved them under.
gat_base_concatenation_minmax_norm = utilities.load_results('gat-base-concatenation-minmax-norm')

In [None]:
# Save a copy of the loaded results under the mounted Google Drive folder.
# NOTE(review): explicit '.pkl' suffix here but not in the name used above - confirm
# how utilities.save_results treats extensions.
full_path = os.path.join("/content/drive/My Drive/", 'gat-base-concatenation-minmax-norm.pkl')
utilities.save_results(gat_base_concatenation_minmax_norm, full_path)

In [None]:
print(gat_base_concatenation_minmax_norm)

### Adding Z-Score Normalization

In [None]:
# Global model variables
# Same settings as the first experiment, declared again for this section.
# NOTE(review): `model` and `optimizer` are not passed to run_feature_combinations,
# which builds its own per feature combination - they look unused here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = gat.GATBase(data, 8, 8)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()
n_epochs = 200
n_runs = 10
# hidden_channels and heads for gat_base_factory.
global_model_params = [8, 8]

In [None]:
# Repeat the GATBase feature-combination runs with utilities.StandardNormalization as the normalization.
run_feature_combinations('gat-base-concatenation-standard-norm', gat_base_factory, global_model_params, normalization=utilities.StandardNormalization)

In [None]:
# Load the results back using the same name the previous cell saved them under.
gat_base_concatenation_standard_norm = utilities.load_results('gat-base-concatenation-standard-norm')

In [None]:
# Save a copy of the loaded results under the mounted Google Drive folder.
# NOTE(review): explicit '.pkl' suffix here but not in the name used above - confirm
# how utilities.save_results treats extensions.
full_path = os.path.join("/content/drive/My Drive/", 'gat-base-concatenation-standard-norm.pkl')
utilities.save_results(gat_base_concatenation_standard_norm, full_path)

In [None]:
print(gat_base_concatenation_standard_norm)

## Experiment 2: GAT with MLP preprocessing on all the feature combinations

In [None]:
# Settings for the GATPre runs.
# NOTE(review): `model` and `optimizer` are not passed to run_feature_combinations,
# which builds its own per feature combination - they look unused here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = gat.GATPre(data, 8, 8, 128)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()
n_epochs = 200
n_runs = 10
# hidden_channels, heads and mlp_hidden_channels for gat_pre_factory.
global_model_params = [8, 8, 128]

In [None]:
# Run the seven feature combinations with GATPre and the default (identity) normalization.
run_feature_combinations('gat-pre-concatenation-without-norm', gat_pre_factory, global_model_params)

In [None]:
# Load the results back using the same name the previous cell saved them under.
gat_pre_concatenation_without_norm = utilities.load_results('gat-pre-concatenation-without-norm')

In [None]:
# Save a copy of the loaded results under the mounted Google Drive folder.
# NOTE(review): explicit '.pkl' suffix here but not in the name used above - confirm
# how utilities.save_results treats extensions.
full_path = os.path.join("/content/drive/My Drive/", 'gat-pre-concatenation-without-norm.pkl')
utilities.save_results(gat_pre_concatenation_without_norm, full_path)

In [None]:
print(gat_pre_concatenation_without_norm)

# GAT with MLP preprocessing: 160 MLP hidden channels

In [None]:
# Settings for the GATPre runs with mlp_hidden_channels raised from 128 to 160.
# NOTE(review): `model` and `optimizer` are not passed to run_feature_combinations,
# which builds its own per feature combination - they look unused here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = gat.GATPre(data, 8, 8, 160)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()
n_epochs = 200
n_runs = 10
# hidden_channels, heads and mlp_hidden_channels for gat_pre_factory.
global_model_params = [8, 8, 160]

In [None]:
# Run the seven feature combinations with GATPre (160 MLP hidden channels), identity normalization.
run_feature_combinations('gat-pre-concatenation-without-norm-160', gat_pre_factory, global_model_params)

In [None]:
# Load the results back using the same name the previous cell saved them under.
gat_pre_concatenation_without_norm_160 = utilities.load_results('gat-pre-concatenation-without-norm-160')

In [None]:
# Save a copy of the loaded results under the mounted Google Drive folder.
# NOTE(review): explicit '.pkl' suffix here but not in the name used above - confirm
# how utilities.save_results treats extensions.
full_path = os.path.join("/content/drive/My Drive/", 'gat-pre-concatenation-without-norm-160.pkl')
utilities.save_results(gat_pre_concatenation_without_norm_160, full_path)

In [None]:
print(gat_pre_concatenation_without_norm_160)

# Adding Z-Score Normalization

In [None]:
# Settings for the GATPre runs (128 MLP hidden channels).
# NOTE(review): `model` and `optimizer` are not passed to run_feature_combinations,
# which builds its own per feature combination - they look unused here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = gat.GATPre(data, 8, 8, 128)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()
n_epochs = 200
n_runs = 10
# hidden_channels, heads and mlp_hidden_channels for gat_pre_factory.
global_model_params = [8, 8, 128]

In [None]:
# Repeat the GATPre runs; utilities.StandardNormalization is passed positionally as `normalization`.
run_feature_combinations('gat-pre-concatenation-standard-norm', gat_pre_factory, global_model_params, utilities.StandardNormalization)

In [None]:
# Load the results back using the same name the previous cell saved them under.
gat_pre_concatenation_standard_norm = utilities.load_results('gat-pre-concatenation-standard-norm')

In [None]:
# Save a copy of the loaded results under the mounted Google Drive folder.
# NOTE(review): explicit '.pkl' suffix here but not in the name used above - confirm
# how utilities.save_results treats extensions.
full_path = os.path.join("/content/drive/My Drive/", 'gat-pre-concatenation-standard-norm.pkl')
utilities.save_results(gat_pre_concatenation_standard_norm, full_path)

In [None]:
print(gat_pre_concatenation_standard_norm)

# Z-Score normalization with 160 MLP hidden channels

In [None]:
# Settings for the GATPre runs with mlp_hidden_channels set to 160.
# NOTE(review): `model` and `optimizer` are not passed to run_feature_combinations,
# which builds its own per feature combination - they look unused here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = gat.GATPre(data, 8, 8, 160)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()
n_epochs = 200
n_runs = 10
# hidden_channels, heads and mlp_hidden_channels for gat_pre_factory.
global_model_params = [8, 8, 160]

In [None]:
# Repeat the 160-channel GATPre runs; utilities.StandardNormalization is passed positionally as `normalization`.
run_feature_combinations('gat-pre-concatenation-standard-norm-160', gat_pre_factory, global_model_params, utilities.StandardNormalization)

In [None]:
# Load the results back using the same name the previous cell saved them under.
gat_pre_concatenation_standard_norm_160 = utilities.load_results('gat-pre-concatenation-standard-norm-160')

In [None]:
# Save a copy of the loaded results under the mounted Google Drive folder.
# NOTE(review): explicit '.pkl' suffix here but not in the name used above - confirm
# how utilities.save_results treats extensions.
full_path = os.path.join("/content/drive/My Drive/", 'gat-pre-concatenation-standard-norm-160.pkl')
utilities.save_results(gat_pre_concatenation_standard_norm_160, full_path)

In [None]:
print(gat_pre_concatenation_standard_norm_160)

## GAT ensemble


In [None]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
import numpy
import gc
from model import model_training

In [None]:
# NOTE(review): data_clone is not referenced again in the visible notebook.
data_clone = data.clone()

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): run_ensemble creates its own model_original / model_positional and
# optimizers on every run, so the three objects below look unused by it.
model_original = gat.GATBase(data, 8, 8)
model_positional = gat.GATBase(data, 8, 8)
# NOTE(review): this optimizer is built from the parameters of the global `model`
# (whatever an earlier cell last assigned to it), not from the two models created above.
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
# criterion and n_epochs are read as globals inside run_ensemble.
criterion = torch.nn.CrossEntropyLoss()
n_epochs = 200
# run_ensemble takes n_runs as a parameter; the calls below pass the literal 5.
n_runs = 5

In [None]:
# Pass-through transformer, so runs without scaling can still go through
# run_ensemble's scaler.fit_transform / scaler.transform calls.
identity_normalizer = FunctionTransformer(lambda x: x)

In [None]:
# Move the data object to `device` before it is handed to run_ensemble.
data = data.to(device)

In [None]:
def _train_gat_base(data, node_features):
    """Train a fresh ``gat.GATBase(data, 8, 8)`` on ``node_features`` and return it.

    ``data.x`` is set to ``node_features`` before the model is built from ``data``.
    The model is moved to the global ``device`` and trained through
    ``model_training`` with the global ``n_epochs`` and ``criterion``.
    """
    data.x = node_features
    gat_model = gat.GATBase(data, 8, 8).to(device)
    gat_optimizer = torch.optim.Adam(gat_model.parameters(), lr=0.005, weight_decay=5e-4)
    model_training(n_epochs, gat_model, data, gat_optimizer, criterion)
    return gat_model


def run_ensemble(data_orig, classifier, scaler, n_runs, file_name):
    """Train a two-model GAT ensemble ``n_runs`` times and save the test accuracies.

    Each run trains one GATBase on the global ``original_features`` and one on the
    global ``positional_features``, builds meta-features from both through
    ``ensemble.get_meta_model_features``, fits ``classifier`` on the rows selected
    by ``data.ensemble_val_mask`` and scores it on the rows selected by
    ``data.test_mask``.

    Args:
        data_orig: Graph data object. It is cloned at the start of every run and
            only the clone is modified.
        classifier: Meta-model exposing ``fit`` and ``predict``. The same instance
            is refit on every run and stored in the saved results.
        scaler: Object exposing ``fit_transform`` and ``transform``; fitted on the
            meta-model training features of each run.
        n_runs: Number of repetitions; must be at least 1.
        file_name: Name handed to ``utilities.save_results``.

    Raises:
        ValueError: If ``n_runs`` is smaller than 1 (the average accuracy would
            otherwise fail with a division by zero after doing no work).
    """
    if n_runs < 1:
        raise ValueError(f"n_runs must be at least 1, got {n_runs}")

    test_accs, run_times = [], []
    for run_idx in range(n_runs):
        print(f"\n RUN: {run_idx}\n")

        data = data_orig.clone()

        start_time = time.time()

        # The helper returns two masks: one replaces val_mask for training the GATs,
        # the other selects the rows used to fit the meta-classifier.
        data.val_mask, data.ensemble_val_mask = ensemble.get_val_set_split(data)

        model_original = _train_gat_base(data, original_features)
        print("\n Model with original features: training completed\n")

        # NOTE(review): positional_features is used as stored globally - confirm it
        # lives on the same device as `data` and the model.
        model_positional = _train_gat_base(data, positional_features)
        print("\n Model with positional features: training completed\n")

        # Named so the imported `features` module is not shadowed.
        base_models = [model_original, model_positional]
        base_features = [original_features, positional_features]

        meta_model_train = ensemble.get_meta_model_features(
            base_models, base_features, data.ensemble_val_mask, data.edge_index)
        meta_model_test = ensemble.get_meta_model_features(
            base_models, base_features, data.test_mask, data.edge_index)

        X_train = meta_model_train.cpu().numpy()
        y_train = data.y[data.ensemble_val_mask].cpu().numpy()
        X_test = meta_model_test.cpu().numpy()
        y_test = data.y[data.test_mask].cpu().numpy()

        # The scaler is fitted on the training rows only and reused for the test rows.
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        classifier.fit(X_train_scaled, y_train)

        # The recorded run time covers training only; prediction happens afterwards.
        end_time = time.time()

        y_pred = classifier.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)

        test_accs.append(accuracy)
        run_times.append(end_time - start_time)

        print("\n Ensemble: training completed")
        print(f"Ensemble accuracy: {accuracy}")

        gc.collect()

    results = {
        'test_accs': test_accs,
        'avg_acc': sum(test_accs) / len(test_accs),
        'model': classifier,
        'run_time': run_times,
    }

    utilities.save_results(results, file_name)

# SVM no normalization


In [None]:
# Ensemble with cuML's SVC as the meta-classifier, meta-features passed through unscaled, 5 runs.
run_ensemble(data, svm.SVC(verbose=0), identity_normalizer, 5, 'ensemble_SVM_non_norm')

In [None]:
# Load the results that run_ensemble saved under the same name.
ensemble_svm_non_norm = utilities.load_results('ensemble_SVM_non_norm')

In [None]:
print(ensemble_svm_non_norm)

# SVM with normalization

In [None]:
# Ensemble with cuML's SVC as the meta-classifier, meta-features scaled by a StandardScaler, 5 runs.
run_ensemble(data, svm.SVC(verbose=0), StandardScaler(), 5, 'ensemble_SVM_std_norm')

In [None]:
# Load the results that run_ensemble saved under the same name.
ensemble_svm_std_norm = utilities.load_results('ensemble_SVM_std_norm')

In [None]:
print(ensemble_svm_std_norm)

# LR no normalization

In [None]:
# Ensemble with cuML's LogisticRegression as the meta-classifier, meta-features unscaled, 5 runs.
# NOTE(review): LogisticRegression is imported from cuml here - confirm that its
# constructor accepts the scikit-learn style `multi_class` keyword.
run_ensemble(data, LogisticRegression(max_iter=10000, multi_class="multinomial",verbose=0), identity_normalizer, 5, 'ensemble_LR_no_norm')

In [None]:
# Load the results that run_ensemble saved under the same name.
ensemble_lr_non_norm = utilities.load_results('ensemble_LR_no_norm')

In [None]:
print(ensemble_lr_non_norm)

# LR with normalization

In [None]:
# Ensemble with cuML's LogisticRegression as the meta-classifier, meta-features scaled by a StandardScaler, 5 runs.
# NOTE(review): LogisticRegression is imported from cuml here - confirm that its
# constructor accepts the scikit-learn style `multi_class` keyword.
run_ensemble(data, LogisticRegression(max_iter=10000, multi_class="multinomial",verbose=0), StandardScaler(), 5, 'ensemble_LR_std_norm')

In [None]:
# Load the results that run_ensemble saved under the same name.
ensemble_lr_std_norm = utilities.load_results('ensemble_LR_std_norm')

In [None]:
print(ensemble_lr_std_norm)

# DT no normalization

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Ensemble with scikit-learn's DecisionTreeClassifier as the meta-classifier, meta-features unscaled, 5 runs.
run_ensemble(data, DecisionTreeClassifier(), identity_normalizer, 5, 'ensemble_DT_non_norm')

In [None]:
# Load the results that run_ensemble saved under the same name.
ensemble_dt_non_norm = utilities.load_results('ensemble_DT_non_norm')

In [None]:
print(ensemble_dt_non_norm)

# DT with normalization

In [None]:
# Ensemble with scikit-learn's DecisionTreeClassifier as the meta-classifier, meta-features scaled by a StandardScaler, 5 runs.
run_ensemble(data, DecisionTreeClassifier(), StandardScaler(), 5, 'ensemble_DT_std_norm')

In [None]:
# Load the results that run_ensemble saved under the same name.
ensemble_dt_std_norm = utilities.load_results('ensemble_DT_std_norm')

In [None]:
print(ensemble_dt_std_norm)