In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pickle

In [2]:
df = pd.read_parquet("/home/sunbeam/STUDY_NEW/PROJECT/data/processed/arxiv_cleaned.parquet")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39440 entries, 0 to 39439
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   abstract          39440 non-null  object
 1   authors           39440 non-null  object
 2   authors_parsed    39440 non-null  object
 3   comments          24248 non-null  object
 4   doi               8837 non-null   object
 5   id                39440 non-null  object
 6   journal-ref       7022 non-null   object
 7   license           37340 non-null  object
 8   report-no         821 non-null    object
 9   submitter         39407 non-null  object
 10  title             39440 non-null  object
 11  update_date       39440 non-null  object
 12  versions          39440 non-null  object
 13  primary_category  39440 non-null  object
 14  final_category    39440 non-null  object
dtypes: object(15)
memory usage: 4.5+ MB


In [None]:
df.describe()

Only keep Required columns

In [4]:
df = df[["abstract", "final_category"]].dropna()

TEXT EDA

Abstract length distribution

In [5]:
df["length"] = df["abstract"].str.split().str.len()

df["length"].describe(percentiles=[0.5, 0.9, 0.95, 0.99])


count    39440.000000
mean       144.144752
std         61.989732
min          5.000000
50%        144.000000
90%        228.000000
95%        250.000000
99%        281.000000
max        567.000000
Name: length, dtype: float64

Class distribution

In [6]:
df["final_category"].value_counts(normalize=True)


final_category
cs             0.428778
math           0.342672
physics        0.125203
engineering    0.040948
stat           0.035827
bio            0.020360
econ           0.006212
Name: proportion, dtype: float64

Vocabulary size estimate

In [7]:
from collections import Counter
import nltk

tokens = []
for t in df["abstract"].head(5000):
    tokens.extend(nltk.word_tokenize(t.lower()))

vocab_size = len(set(tokens))
vocab_size


38753

TEXT PREPROCESSING

In [8]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r"\$.*?\$", " ", text)  # remove latex math
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

df["clean_abstract"] = df["abstract"].apply(clean_text)


LABEL ENCODING

In [18]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df["label"] = le.fit_transform(df["final_category"])
num_classes = len(le.classes_)


In [20]:
with open("label_encoder_category.pkl","wb") as file:
    pickle.dump(le,file)

TRAIN / TEST SPLIT (STRATIFIED)

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df["clean_abstract"],
    df["label"],
    test_size=0.2,
    stratify=df["label"],
    random_state=42
)


TOKENIZATION & PADDING

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_WORDS = 40000
MAX_LEN = 150
  # from EDA

tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding="post")
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding="post")


2026-01-21 12:00:54.853533: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-01-21 12:00:55.071291: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2026-01-21 12:00:56.525182: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [13]:
import pickle

with open("tokenizer.pkl","wb") as file:
    pickle.dump(tokenizer,file)

EMBEDDING GLOVE LAYER

In [12]:
EMBEDDING_DIM = 100
embedding_index = {}

with open("/home/sunbeam/STUDY_NEW/PROJECT/data/embeddings/glove.6B.100d.txt", encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        embedding_index[word] = vector

In [13]:
word_index = tokenizer.word_index
vocab_size = min(MAX_WORDS, len(word_index)) + 1

embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))

for word, i in word_index.items():
    if i >= vocab_size:
        continue
    vector = embedding_index.get(word)
    if vector is not None:
        embedding_matrix[i] = vector

In [14]:
print(np.mean(embedding_matrix))
print(np.std(embedding_matrix))


0.0010946105480698644
0.4208363194609128


BUILD BiLSTM MODEL

In [15]:
from tensorflow.keras.layers import Embedding

embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_LEN,
    trainable=False   # VERY important for GloVe
)


I0000 00:00:1768971494.830720    5954 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6287 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3070 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Bidirectional, LSTM, Dense, Dropout
)
from tensorflow.keras.optimizers import Adam

model = Sequential([
    embedding_layer,
    Bidirectional(LSTM(64,dropout=0.2)),
    Dropout(0.5),
    Dense(32, activation="relu"),
    Dense(num_classes, activation="softmax")
])


HANDLE CLASS IMBALANCE

In [17]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train
)

class_weights = dict(enumerate(class_weights))


In [19]:
from tensorflow.keras.callbacks import EarlyStopping,TensorBoard
import datetime
import os

log_dir = os.path.join(
    "logs",
    "fit",
    datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
)

tensorboard_cb = TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,     # weight histograms
    write_graph=True,
    write_images=False,
    update_freq="epoch"
)


early_stop = EarlyStopping(
    monitor="val_loss",        # watch validation loss
    patience=6,                # stop after 3 epochs of no improvement
    restore_best_weights=True  # rollback to best model
)

In [20]:
embedding_layer.trainable = False
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=Adam(learning_rate=1e-3),
    metrics=["accuracy"]
)

model.summary()

history_phase1 = model.fit(
    X_train_pad, y_train,
    validation_split=0.2,
    epochs=8,
    batch_size=64,
    class_weight=class_weights,
    callbacks=[early_stop, tensorboard_cb]
)

Epoch 1/8


2026-01-21 10:31:09.428644: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91800


[1m395/395[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 14ms/step - accuracy: 0.4048 - loss: 1.7336 - val_accuracy: 0.5633 - val_loss: 1.2805
Epoch 2/8
[1m395/395[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.4909 - loss: 1.4094 - val_accuracy: 0.5292 - val_loss: 1.1643
Epoch 3/8
[1m395/395[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.5156 - loss: 1.2412 - val_accuracy: 0.6024 - val_loss: 1.0666
Epoch 4/8
[1m395/395[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.5604 - loss: 1.1028 - val_accuracy: 0.6530 - val_loss: 0.9850
Epoch 5/8
[1m395/395[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.5938 - loss: 1.0488 - val_accuracy: 0.6023 - val_loss: 0.9838
Epoch 6/8
[1m395/395[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.6113 - loss: 0.9779 - val_accuracy: 0.6016 - val_loss: 1.0159
Epoch 7/8
[1m395/395[0m [32m━━━━━━━

In [21]:
embedding_layer.trainable = True
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=Adam(learning_rate=1e-4),
    metrics=["accuracy"]
)

history_phase2 = model.fit(
    X_train_pad, y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=64,
    class_weight=class_weights,
    callbacks=[early_stop,tensorboard_cb]
)



Epoch 1/50
[1m395/395[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 15ms/step - accuracy: 0.6647 - loss: 0.8188 - val_accuracy: 0.6777 - val_loss: 0.8674
Epoch 2/50
[1m395/395[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - accuracy: 0.6743 - loss: 0.7803 - val_accuracy: 0.6943 - val_loss: 0.8356
Epoch 3/50
[1m395/395[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.6901 - loss: 0.7452 - val_accuracy: 0.6852 - val_loss: 0.8577
Epoch 4/50
[1m395/395[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.6886 - loss: 0.7249 - val_accuracy: 0.7138 - val_loss: 0.8048
Epoch 5/50
[1m395/395[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.7003 - loss: 0.6970 - val_accuracy: 0.7167 - val_loss: 0.7939
Epoch 6/50
[1m395/395[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.7045 - loss: 0.6812 - val_accuracy: 0.7073 - val_loss: 0.8092
Epoch 7/50
[1m395/395

In [22]:
print(history_phase2.history["val_accuracy"][-5:])
print(history_phase2.history["val_loss"][-5:])


[0.7269846200942993, 0.736650288105011, 0.7490096688270569, 0.7445729970932007, 0.7409285306930542]
[0.777579128742218, 0.7635042071342468, 0.7398005127906799, 0.7419564723968506, 0.7577699422836304]


In [23]:
print(max(history_phase2.history["val_accuracy"]))
print(min(history_phase2.history["val_loss"]))


0.7490096688270569
0.7219878435134888


In [26]:
model.save("./models/arxiv_bilstm_model.keras")




In [39]:
X_test[16032]

'a relatively new topic in computability theory is the study of notions of computation that are robust against mistakes on some kind of small set however despite the recent popularity of this topic relatively foundational questions about the notions of reducibility involved still persist in this paper we examine two notions of robust information coding effective dense reducibility and coarse reducibility and answer the question posed in 1 whether the degrees of functions under these reductions are the same as the degrees of sets despite the surface similarity of these two reducibilities we show that every uniform coarse degree contains a set but that this fails even for the non uniform effective dense degrees we then further distinguish these two notions by showing that whether g is coarsely reducible to f is an arithmetic property of f and g while for non uniform effective dense reducibility it is a complete property to prove these results we introduce notions of forcing that allow us

TESTING OF MODEL

In [34]:
import tensorflow as tf

model = tf.keras.models.load_model('/home/sunbeam/STUDY_NEW/PROJECT/src/Modelling/models/arxiv_bilstm_model.keras',compile=False)

with open("/home/sunbeam/STUDY_NEW/PROJECT/src/Modelling/models/label_encoder_category.pkl","rb") as file:
    label_encoder=pickle.load(file)

with open("/home/sunbeam/STUDY_NEW/PROJECT/src/Modelling/models/tokenizer.pkl","rb") as file:
    tokenizer=pickle.load(file)



In [35]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_test_clean = X_test.apply(clean_text)

X_test_seq = tokenizer.texts_to_sequences(X_test_clean)

X_test_seq = pad_sequences(
    X_test_seq,
    maxlen=MAX_LEN,
    padding="post"
)



In [36]:
# Predict
y_pred_probs = model.predict(X_test_seq, batch_size=64)
y_pred = np.argmax(y_pred_probs, axis=1)

# True labels
y_true = np.argmax(y_test, axis=1)

# Metrics
from sklearn.metrics import classification_report, f1_score

print("Macro F1:", f1_score(y_true, y_pred, average="macro"))
print("Micro F1:", f1_score(y_true, y_pred, average="micro"))
print("Weighted F1:", f1_score(y_true, y_pred, average="weighted"))

print(classification_report(
    y_true,
    y_pred,
    target_names=label_encoder.classes_
))


[1m  1/124[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:46[0m 867ms/step

2026-01-21 13:22:57.290244: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91800


[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step


ValueError: `axis` must be fewer than the number of dimensions (1)