## Load Data & Dependencies

In [15]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras import Input
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from keras.backend import clear_session
from xgboost import XGBClassifier
import numpy as np
import pandas as pd

X = pd.read_csv('../../data/ogb/processed/ogn_arxiv.csv')
y = pd.read_csv('../../data/ogb/processed/labels.csv')

In [16]:
print(X.shape)
print(y.shape)

(169343, 131)
(169343, 1)


## Split Data Into Train, Validation, & Test Sets

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y['0'].values, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

## Standard Baseline With Extreme Gradient Boosting

In [18]:
from sklearn.metrics import f1_score

xgb_classifier = XGBClassifier(
    objective='multi:softmax',
    eval_metric='mlogloss',
    n_estimators=200,
    learning_rate=0.1,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=1,
    n_jobs=-1,
    early_stopping_rounds=10
)

xgb_classifier.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=True
)


val_predictions = xgb_classifier.predict(X_val)

val_accuracy = accuracy_score(y_val, val_predictions)
val_macro_f1 = f1_score(y_val, val_predictions, average='macro')

print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Validation Macro-F1: {val_macro_f1:.4f}")

predictions = xgb_classifier.predict(X_test)

[0]	validation_0-mlogloss:3.25492
[1]	validation_0-mlogloss:3.04821
[2]	validation_0-mlogloss:2.89278
[3]	validation_0-mlogloss:2.76977
[4]	validation_0-mlogloss:2.66710
[5]	validation_0-mlogloss:2.57849
[6]	validation_0-mlogloss:2.50234
[7]	validation_0-mlogloss:2.43509
[8]	validation_0-mlogloss:2.37480
[9]	validation_0-mlogloss:2.32113
[10]	validation_0-mlogloss:2.27252
[11]	validation_0-mlogloss:2.22790
[12]	validation_0-mlogloss:2.18732
[13]	validation_0-mlogloss:2.14981
[14]	validation_0-mlogloss:2.11561
[15]	validation_0-mlogloss:2.08397
[16]	validation_0-mlogloss:2.05481
[17]	validation_0-mlogloss:2.02744
[18]	validation_0-mlogloss:2.00180
[19]	validation_0-mlogloss:1.97782
[20]	validation_0-mlogloss:1.95492
[21]	validation_0-mlogloss:1.93438
[22]	validation_0-mlogloss:1.91503
[23]	validation_0-mlogloss:1.89651
[24]	validation_0-mlogloss:1.87933
[25]	validation_0-mlogloss:1.86309
[26]	validation_0-mlogloss:1.84808
[27]	validation_0-mlogloss:1.83340
[28]	validation_0-mlogloss:1.8

In [19]:
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions, output_dict=True, zero_division=0)

f1 = report['macro avg']['f1-score']
recall = report['macro avg']['recall']
precision = report['macro avg']['precision']

print(f"Accuracy: {accuracy:.4f}")
print(f"F1: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")

Accuracy: 0.5586
F1: 0.3380
Recall: 0.3008
Precision: 0.4700


## Small Deep‑Learning Text Classifier

In [21]:
clear_session()

num_classes = len(set(y_train))
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(128, activation='relu', name='dense_1'),
    Dropout(0.3),
    Dense(64, activation='relu', name='dense_2'),
    Dense(num_classes, activation='softmax', name='output')
])

model.compile(
    optimizer= Adam(learning_rate=1e-3),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=32,
    verbose=True,
)

y_predictions_probability = model.predict(X_test)
nn_predictions = np.argmax(y_predictions_probability, axis=1)

Epoch 1/20
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - accuracy: 0.4211 - loss: 2.1193 - val_accuracy: 0.5261 - val_loss: 1.6404
Epoch 2/20
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.5229 - loss: 1.6362 - val_accuracy: 0.5363 - val_loss: 1.5935
Epoch 3/20
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.5414 - loss: 1.5677 - val_accuracy: 0.5445 - val_loss: 1.5619
Epoch 4/20
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.5495 - loss: 1.5349 - val_accuracy: 0.5492 - val_loss: 1.5431
Epoch 5/20
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.5524 - loss: 1.5067 - val_accuracy: 0.5510 - val_loss: 1.5370
Epoch 6/20
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.5663 - loss: 1.4712 - val_accuracy: 0.5596 - val_loss: 1.5179
Epoch 7/20

In [22]:
f1 = report['macro avg']['f1-score']
recall = report['macro avg']['recall']
precision = report['macro avg']['precision']

print(f"Accuracy: {accuracy:.4f}")
print(f"F1: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")

Accuracy: 0.5441
F1: 0.3245
Recall: 0.2877
Precision: 0.4724
