In [None]:
!pip install -q -r requirements.txt

In [None]:
!pip install -q \
    --extra-index-url=https://pypi.nvidia.com \
    cudf-cu12==23.12.* dask-cudf-cu12==23.12.* cuml-cu12==23.12.* \
    cugraph-cu12==23.12.* cuspatial-cu12==23.12.* cuproj-cu12==23.12.* \
    cuxfilter-cu12==23.12.* cucim-cu12==23.12.* pylibraft-cu12==23.12.* \
    raft-dask-cu12==23.12.*

In [None]:
from cuml import svm
from cuml import LogisticRegression
from cuml.common import logger
import datasets
import experiments
import features
import utilities
import gcn
import node2vec
import ensemble
import torch
from model import model_training
import pickle as pk
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler,MinMaxScaler,FunctionTransformer, Normalizer
import numpy
import gc
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import clone

In [None]:
# Load the graph `G` and the accompanying `data` object used by every cell below.
# pass 'ogbn-arxiv' to load ArXiv dataset
# NOTE(review): the experiment cells further down save their results under
# '*_arxiv' file names regardless of which dataset is selected here — confirm
# that this is intended when running on 'cora'.
G, data = datasets.load_data('cora')
# Print the loaded data object as a quick sanity check.
print(data)

# Experiment 1: Linear models + combinations of structural and positional features

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Global feature tensors shared by all experiments below. All three are placed
# on the same device so that they can be concatenated with each other.
original_features = data.x.to(device)

# NOTE(review): the two-letter codes select which structural measures are
# computed; their exact meaning is defined in features.structural_features.
structural_features = features.structural_features(
    G, ['cc', 'bc', 'dc', 'ec', 'pr', 'cn', 'lc', 'nd', 'kc']
).to(device)

# NOTE(review): the meaning of the arguments 128 and 50 is defined by
# features.positional_features — TODO document them here.
positional_features = features.positional_features(data, 128, 50).to(device)

In [None]:
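# Optional: uncomment to load the features through utilities.load_results instead
# of recomputing them (assumes they were saved earlier under these names).
#structural_features=utilities.load_results('structural_features')
#positional_features=utilities.load_results('positional_features')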

In [None]:
# Development aid: uncomment to have IPython automatically reload edited modules.
#%load_ext autoreload
#%autoreload 2

In [None]:
def run_feature_combinations(file_name, classifier_original, normalization=lambda x: x):
    """Train one classifier per feature combination and save the test accuracies.

    For each of the seven combinations of the module-level ``original_features``,
    ``structural_features`` and ``positional_features`` tensors, a fresh clone of
    ``classifier_original`` is fitted on the rows selected by ``data.train_mask``
    and scored with ``accuracy_score`` on the rows selected by ``data.test_mask``.

    When ``data.name == 'Cora'`` and the combination contains the original
    features, the original columns are used as they are and only the appended
    columns are passed through ``normalization``. In every other case the whole
    feature matrix is normalized.

    The global ``data`` object is only read; ``data.x`` is not modified.

    Args:
        file_name: Name handed to ``utilities.save_results`` for the result dict.
        classifier_original: Estimator template. It is cloned for every
            combination, so the passed instance itself is never fitted.
        normalization: Callable applied to a feature tensor before training.
            Defaults to the identity.
            NOTE(review): assumed to return a new tensor without modifying its
            input — confirm for the helpers in ``utilities``.

    Side effects:
        Prints one progress line per combination and saves
        ``{combination_name: {'avg_acc': accuracy}}`` via
        ``utilities.save_results``.
    """
    # (result name, tensors to concatenate). The combined matrix is assembled
    # inside the loop so that only one concatenated copy is alive at a time.
    combinations = [
        ('original', (original_features,)),
        ('structural', (structural_features,)),
        ('positional', (positional_features,)),
        ('original-structural', (original_features, structural_features)),
        ('original-positional', (original_features, positional_features)),
        ('structural-positional', (structural_features, positional_features)),
        ('original-structural-positional',
         (original_features, structural_features, positional_features)),
    ]

    orig_num_feat = original_features.size()[1]
    is_cora = data.name == 'Cora'

    # Labels do not depend on the feature combination, so extract them once.
    y_train = data.y[data.train_mask].cpu().numpy()
    y_test = data.y[data.test_mask].cpu().numpy()

    basic_models = dict()
    for curr_file_name, parts in combinations:
        classifier = clone(classifier_original)

        if len(parts) == 1:
            curr_features = parts[0]
        else:
            curr_features = utilities.concatenate(*parts)

        if is_cora and curr_file_name.startswith('original'):
            # The original columns come first in these combinations; keep them
            # untouched and normalize only the columns appended after them.
            num_other_feat = curr_features.size()[1] - orig_num_feat
            if num_other_feat == 0:
                # 'original' alone: there is nothing to normalize, and passing
                # a zero-column tensor to the normalization could fail.
                x = curr_features
            else:
                orig_feats, other_feats = curr_features.split(
                    [orig_num_feat, num_other_feat], dim=-1)
                x = utilities.concatenate(orig_feats, normalization(other_feats))
        else:
            x = normalization(curr_features)

        X_train = x[data.train_mask].cpu().numpy()
        X_test = x[data.test_mask].cpu().numpy()

        classifier.fit(X_train, y_train)

        y_pred = classifier.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        basic_models[curr_file_name] = {'avg_acc': accuracy}

        print(f'Training {curr_file_name} completed!')

    utilities.save_results(basic_models, file_name)

## Logistic Regression

In [None]:
# Logistic regression on the feature combinations without any normalization
# (run_feature_combinations defaults to the identity).
classifier = LogisticRegression(
    max_iter=10000,
    multi_class="multinomial",
    verbose=0,
)
run_feature_combinations(file_name='lr_arxiv', classifier_original=classifier)

# Load the results stored under the same name and show them.
lr_arxiv = utilities.load_results('lr_arxiv')
print(lr_arxiv)

## Logistic Regression + Min-Max Normalization

In [None]:
# Logistic regression with utilities.MinMaxNormalization applied to the features.
classifier = LogisticRegression(
    max_iter=10000,
    multi_class="multinomial",
    verbose=0,
)
normalization = utilities.MinMaxNormalization
run_feature_combinations(
    file_name='lr_minmax_arxiv',
    classifier_original=classifier,
    normalization=normalization,
)

# Load the results stored under the same name and show them.
lr_minmax_arxiv = utilities.load_results('lr_minmax_arxiv')
print(lr_minmax_arxiv)

## Logistic Regression + Std Normalization

In [None]:
# Logistic regression with utilities.StandardNormalization applied to the features.
classifier = LogisticRegression(
    max_iter=10000,
    multi_class="multinomial",
    verbose=0,
)
normalization = utilities.StandardNormalization
run_feature_combinations(
    file_name='lr_std_arxiv',
    classifier_original=classifier,
    normalization=normalization,
)

# Load the results stored under the same name and show them.
lr_std_arxiv = utilities.load_results('lr_std_arxiv')
print(lr_std_arxiv)

## SVM

In [None]:
# SVM on the feature combinations without any normalization
# (run_feature_combinations defaults to the identity).
classifier = svm.SVC(verbose=0)
run_feature_combinations(file_name='svm_arxiv', classifier_original=classifier)

# Load the results stored under the same name and show them.
svm_arxiv = utilities.load_results('svm_arxiv')
print(svm_arxiv)

## SVM + Min-Max Normalization

In [None]:
# SVM with utilities.MinMaxNormalization applied to the features.
classifier = svm.SVC(verbose=0)
normalization = utilities.MinMaxNormalization
run_feature_combinations(
    file_name='svm_minmax_arxiv',
    classifier_original=classifier,
    normalization=normalization,
)

# Load the results stored under the same name and show them.
svm_minmax_arxiv = utilities.load_results('svm_minmax_arxiv')
print(svm_minmax_arxiv)

## SVM + Std Normalization

In [None]:
# SVM with utilities.StandardNormalization applied to the features.
classifier = svm.SVC(verbose=0)
normalization = utilities.StandardNormalization
run_feature_combinations(
    file_name='svm_std_arxiv',
    classifier_original=classifier,
    normalization=normalization,
)

# Load the results stored under the same name and show them.
svm_std_arxiv = utilities.load_results('svm_std_arxiv')
print(svm_std_arxiv)

## Decision Tree

In [None]:
# Decision tree (fixed random_state) on the feature combinations without any
# normalization (run_feature_combinations defaults to the identity).
classifier = DecisionTreeClassifier(random_state=404)
run_feature_combinations(file_name='dt_arxiv', classifier_original=classifier)

# Load the results stored under the same name and show them.
dt_arxiv = utilities.load_results('dt_arxiv')
print(dt_arxiv)

## Decision Tree + Min-Max Normalization

In [None]:
# Decision tree (fixed random_state) with utilities.MinMaxNormalization applied
# to the features.
classifier = DecisionTreeClassifier(random_state=404)
normalization = utilities.MinMaxNormalization
run_feature_combinations(
    file_name='dt_minmax_arxiv',
    classifier_original=classifier,
    normalization=normalization,
)

# Load the results stored under the same name and show them.
dt_minmax_arxiv = utilities.load_results('dt_minmax_arxiv')
print(dt_minmax_arxiv)

## Decision Tree + Std Normalization

In [None]:
# Decision tree (fixed random_state) with utilities.StandardNormalization applied
# to the features.
classifier = DecisionTreeClassifier(random_state=404)
normalization = utilities.StandardNormalization
run_feature_combinations(
    file_name='dt_std_arxiv',
    classifier_original=classifier,
    normalization=normalization,
)

# Load the results stored under the same name and show them.
dt_std_arxiv = utilities.load_results('dt_std_arxiv')
print(dt_std_arxiv)

# Experiment 2: Linear models + combinations of structural and positional features + ensemble

In [None]:
def run_ensemble(data_orig, classifier_meta, scaler_meta, classifier_base, file_name):
  """Fit base classifiers per feature set plus a meta classifier; save its test accuracy.

  One clone of ``classifier_base`` is fitted on the training rows of each of the
  module-level ``original_features`` and ``positional_features`` tensors. The
  fitted models are handed to ``ensemble.get_meta_model_features`` to build the
  meta model's inputs for the validation rows and for the test rows.
  ``classifier_meta`` is then fitted on the scaled validation inputs and scored
  with ``accuracy_score`` on the scaled test inputs.

  Args:
    data_orig: Data object providing ``y``, ``train_mask``, ``val_mask``,
      ``test_mask`` and ``edge_index``. It is only read, never modified.
    classifier_meta: Estimator used as meta model; it is fitted in place.
    scaler_meta: Transformer offering ``fit_transform`` / ``transform``. It is
      fitted on the validation meta features and re-used for the test ones.
    classifier_base: Estimator template, cloned once per feature set.
    file_name: Name handed to ``utilities.save_results`` for the result dict.

  Side effects:
    Prints the ensemble accuracy and saves ``{'avg_acc': accuracy}`` via
    ``utilities.save_results``.
  """
  # Only masks, labels and edges are read below, so the data object is used
  # directly instead of deep-copying it. If it ever has to be modified here
  # (e.g. to re-split the validation set), clone it first.
  data = data_orig

  # Named `feature_sets` so the imported `features` module is not shadowed.
  feature_sets = [original_features, positional_features]

  # The training labels are the same for every base model.
  y_train_base = data.y[data.train_mask].cpu().numpy()

  models = []
  for feat in feature_sets:
    cl = clone(classifier_base)
    cl.fit(feat[data.train_mask].cpu().numpy(), y_train_base)

    models.append(cl)

    # Drop the loop-local reference so that `del models` further down really
    # releases the base models.
    del cl
    gc.collect()

  meta_model_train = ensemble.get_meta_model_features(models, feature_sets, data.val_mask, data.edge_index,linear=True)
  meta_model_test = ensemble.get_meta_model_features(models, feature_sets, data.test_mask, data.edge_index,linear=True)

  # The base models are no longer needed once the meta features exist.
  del models
  gc.collect()

  # The meta model is trained on the validation split, not the training split
  # the base models were fitted on.
  X_train = meta_model_train.cpu().numpy()
  y_train = data.y[data.val_mask].cpu().numpy()
  X_test = meta_model_test.cpu().numpy()
  y_test = data.y[data.test_mask].cpu().numpy()

  X_train_scaled = scaler_meta.fit_transform(X_train)
  X_test_scaled = scaler_meta.transform(X_test)

  classifier_meta.fit(X_train_scaled, y_train)

  y_pred = classifier_meta.predict(X_test_scaled)
  accuracy = accuracy_score(y_test, y_pred)

  print("Ensemble training completed")
  print(f"Ensemble accuracy: {accuracy}")

  results = {'avg_acc': accuracy}

  utilities.save_results(results, file_name)

  gc.collect()

## SVM as meta and DT as base

In [None]:
# Ensemble: SVM as meta classifier, decision trees as base classifiers.
classifier_meta = svm.SVC(verbose=0)
classifier_base = DecisionTreeClassifier(random_state=404)
# Identity transformer: the meta features are passed on unscaled.
scaler_meta = FunctionTransformer(func=lambda x: x)

run_ensemble(
    data_orig=data,
    classifier_meta=classifier_meta,
    scaler_meta=scaler_meta,
    classifier_base=classifier_base,
    file_name='ensemble_svm_dt_arxiv',
)

# Load the result stored under the same name and show it.
ensemble_svm_dt_arxiv = utilities.load_results('ensemble_svm_dt_arxiv')
print(ensemble_svm_dt_arxiv)

## SVM as meta and LR as base

In [None]:
# Ensemble: SVM as meta classifier, logistic regression as base classifier.
classifier_meta = svm.SVC(verbose=0)
classifier_base = LogisticRegression(
    max_iter=10000,
    multi_class="multinomial",
    verbose=0,
)
# Identity transformer: the meta features are passed on unscaled.
scaler_meta = FunctionTransformer(func=lambda x: x)

run_ensemble(
    data_orig=data,
    classifier_meta=classifier_meta,
    scaler_meta=scaler_meta,
    classifier_base=classifier_base,
    file_name='ensemble_svm_lr_arxiv',
)

# Load the result stored under the same name and show it.
ensemble_svm_lr_arxiv = utilities.load_results('ensemble_svm_lr_arxiv')
print(ensemble_svm_lr_arxiv)

## LR as meta and DT as base

In [None]:
# Ensemble: logistic regression as meta classifier, decision trees as base classifiers.
classifier_meta = LogisticRegression(
    max_iter=10000,
    multi_class="multinomial",
    verbose=0,
)
classifier_base = DecisionTreeClassifier(random_state=404)
# Identity transformer: the meta features are passed on unscaled.
scaler_meta = FunctionTransformer(func=lambda x: x)

run_ensemble(
    data_orig=data,
    classifier_meta=classifier_meta,
    scaler_meta=scaler_meta,
    classifier_base=classifier_base,
    file_name='ensemble_lr_dt_arxiv',
)

# Load the result stored under the same name and show it.
ensemble_lr_dt_arxiv = utilities.load_results('ensemble_lr_dt_arxiv')
print(ensemble_lr_dt_arxiv)

## LR as meta and SVM as base

In [None]:
# Ensemble: logistic regression as meta classifier, SVMs as base classifiers.
classifier_meta = LogisticRegression(
    max_iter=10000,
    multi_class="multinomial",
    verbose=0,
)
classifier_base = svm.SVC(verbose=0)
# Identity transformer: the meta features are passed on unscaled.
scaler_meta = FunctionTransformer(func=lambda x: x)

run_ensemble(
    data_orig=data,
    classifier_meta=classifier_meta,
    scaler_meta=scaler_meta,
    classifier_base=classifier_base,
    file_name='ensemble_lr_svm_arxiv',
)

# Load the result stored under the same name and show it.
ensemble_lr_svm_arxiv = utilities.load_results('ensemble_lr_svm_arxiv')
print(ensemble_lr_svm_arxiv)

## LR + min-max as meta and SVM as base

In [None]:
# Ensemble: logistic regression as meta classifier on min-max scaled meta
# features, SVMs as base classifiers.
classifier_meta = LogisticRegression(
    max_iter=10000,
    multi_class="multinomial",
    verbose=0,
)
classifier_base = svm.SVC(verbose=0)
scaler_meta = MinMaxScaler()

run_ensemble(
    data_orig=data,
    classifier_meta=classifier_meta,
    scaler_meta=scaler_meta,
    classifier_base=classifier_base,
    file_name='ensemble_lr_minmax_svm_arxiv',
)

# Load the result stored under the same name and show it.
ensemble_lr_minmax_svm_arxiv = utilities.load_results('ensemble_lr_minmax_svm_arxiv')
print(ensemble_lr_minmax_svm_arxiv)

## DT as meta and LR as base

In [None]:
# Ensemble: decision tree as meta classifier, logistic regression as base classifier.
classifier_meta = DecisionTreeClassifier(random_state=404)
classifier_base = LogisticRegression(
    max_iter=10000,
    multi_class="multinomial",
    verbose=0,
)
# Identity transformer: the meta features are passed on unscaled.
scaler_meta = FunctionTransformer(func=lambda x: x)

run_ensemble(
    data_orig=data,
    classifier_meta=classifier_meta,
    scaler_meta=scaler_meta,
    classifier_base=classifier_base,
    file_name='ensemble_dt_lr_arxiv',
)

# Load the result stored under the same name and show it.
ensemble_dt_lr_arxiv = utilities.load_results('ensemble_dt_lr_arxiv')
print(ensemble_dt_lr_arxiv)

## DT as meta and SVM as base

In [None]:
# Ensemble: decision tree as meta classifier, SVMs as base classifiers.
classifier_meta = DecisionTreeClassifier(random_state=404)
classifier_base = svm.SVC(verbose=0)
# Identity transformer: the meta features are passed on unscaled.
scaler_meta = FunctionTransformer(func=lambda x: x)

run_ensemble(
    data_orig=data,
    classifier_meta=classifier_meta,
    scaler_meta=scaler_meta,
    classifier_base=classifier_base,
    file_name='ensemble_dt_svm_arxiv',
)

# Load the result stored under the same name and show it.
ensemble_dt_svm_arxiv = utilities.load_results('ensemble_dt_svm_arxiv')
print(ensemble_dt_svm_arxiv)