## Load Data & Dependencies

In [16]:
!pip install ogb --quiet
print("Install ogb")

Install ogb


In [17]:
from keras.src.utils.module_utils import tensorflow
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from ogb.nodeproppred import NodePropPredDataset
from keras.backend import clear_session
from tensorflow.keras import Input
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from keras.backend import clear_session
from xgboost import XGBClassifier
import numpy as np
import pandas as pd
import warnings
import torch

warnings.filterwarnings("ignore", message=".*weights_only=False.*")

original_load = torch.load
torch.load = lambda *args, **kwargs: original_load(*args, weights_only=False, **kwargs)

dataset = NodePropPredDataset(name="ogbn-arxiv", root="../../data/ogb")
graph, labels = dataset[0]
X = pd.DataFrame(data=graph['node_feat'])
y = pd.DataFrame(data=labels.flatten(), columns=['label'])

## Split Data Into Train, Validation, & Test Sets

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y['label'].values, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

## Standard Baseline With Extreme Gradient Boosting

In [19]:
from sklearn.metrics import f1_score

xgb_classifier = XGBClassifier(
    objective='multi:softmax',
    eval_metric='mlogloss',
    n_estimators=200,
    learning_rate=0.1,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=1,
    n_jobs=-1,
    early_stopping_rounds=10
)

xgb_classifier.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=True
)


val_predictions = xgb_classifier.predict(X_val)

val_accuracy = accuracy_score(y_val, val_predictions)
val_macro_f1 = f1_score(y_val, val_predictions, average='macro')

print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Validation Macro-F1: {val_macro_f1:.4f}")

predictions = xgb_classifier.predict(X_test)

[0]	validation_0-mlogloss:3.27371
[1]	validation_0-mlogloss:3.06891
[2]	validation_0-mlogloss:2.91675
[3]	validation_0-mlogloss:2.79527
[4]	validation_0-mlogloss:2.69452
[5]	validation_0-mlogloss:2.60856
[6]	validation_0-mlogloss:2.53288
[7]	validation_0-mlogloss:2.46786
[8]	validation_0-mlogloss:2.40855
[9]	validation_0-mlogloss:2.35525
[10]	validation_0-mlogloss:2.30725
[11]	validation_0-mlogloss:2.26327
[12]	validation_0-mlogloss:2.22333
[13]	validation_0-mlogloss:2.18704
[14]	validation_0-mlogloss:2.15304
[15]	validation_0-mlogloss:2.12199
[16]	validation_0-mlogloss:2.09283
[17]	validation_0-mlogloss:2.06606
[18]	validation_0-mlogloss:2.04093
[19]	validation_0-mlogloss:2.01796
[20]	validation_0-mlogloss:1.99628
[21]	validation_0-mlogloss:1.97564
[22]	validation_0-mlogloss:1.95664
[23]	validation_0-mlogloss:1.93852
[24]	validation_0-mlogloss:1.92182
[25]	validation_0-mlogloss:1.90621
[26]	validation_0-mlogloss:1.89150
[27]	validation_0-mlogloss:1.87739
[28]	validation_0-mlogloss:1.8

In [20]:
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions, output_dict=True, zero_division=0)

f1 = report['macro avg']['f1-score']
recall = report['macro avg']['recall']
precision = report['macro avg']['precision']

print(f"Accuracy: {accuracy:.4f}")
print(f"F1: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")

Accuracy: 0.5441
F1: 0.3245
Recall: 0.2877
Precision: 0.4724


## Small Deep‑Learning Text Classifier

In [21]:
clear_session()

num_classes = len(set(y_train))
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(128, activation='relu', name='dense_1'),
    Dense(64, activation='relu', name='dense_2'),
    Dense(num_classes, activation='softmax', name='output')
])

model.compile(
    optimizer= Adam(learning_rate=1e-3),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=32,
    verbose=True,
)

y_predictions_probability = model.predict(X_test)
nn_predictions = np.argmax(y_predictions_probability, axis=1)

Epoch 1/20
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - accuracy: 0.4211 - loss: 2.1193 - val_accuracy: 0.5261 - val_loss: 1.6404
Epoch 2/20
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.5229 - loss: 1.6362 - val_accuracy: 0.5363 - val_loss: 1.5935
Epoch 3/20
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.5414 - loss: 1.5677 - val_accuracy: 0.5445 - val_loss: 1.5619
Epoch 4/20
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.5495 - loss: 1.5349 - val_accuracy: 0.5492 - val_loss: 1.5431
Epoch 5/20
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.5524 - loss: 1.5067 - val_accuracy: 0.5510 - val_loss: 1.5370
Epoch 6/20
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.5663 - loss: 1.4712 - val_accuracy: 0.5596 - val_loss: 1.5179
Epoch 7/20

In [22]:
f1 = report['macro avg']['f1-score']
recall = report['macro avg']['recall']
precision = report['macro avg']['precision']

print(f"Accuracy: {accuracy:.4f}")
print(f"F1: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")

Accuracy: 0.5441
F1: 0.3245
Recall: 0.2877
Precision: 0.4724
