In [1]:
import pandas as pd

# Load the train/test splits – update the file paths if needed
train_df = pd.read_csv("arxiv_train.csv")
test_df = pd.read_csv("arxiv_test.csv")

# Fail fast with a readable message if a split lacks the label column or any
# text column, instead of surfacing a KeyError several cells further down.
for split_name, split_df in (("train", train_df), ("test", test_df)):
    assert "label" in split_df.columns, f"{split_name} split has no 'label' column"
    assert {"abstract", "clean_abstract"} & set(split_df.columns), (
        f"{split_name} split has neither an 'abstract' nor a 'clean_abstract' column"
    )

# Quick check of columns
print(train_df.columns)
# Output recorded for these files: Index(['Unnamed: 0', 'abstract', 'label'], dtype='object')
# NOTE: there is no 'clean_abstract' column in that listing; it exists only if a
# preprocessing step has added it to the CSVs.


Index(['Unnamed: 0', 'abstract', 'label'], dtype='object')


b. Using Keras Tokenizer to Convert Text to Sequences

Since the "clean_abstract" column contains strings, we can directly use it with the Tokenizer.

In [None]:
import tensorflow as tf


Tokenizer = tf.keras.preprocessing.text.Tokenizer
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

# Set a vocabulary size and an out-of-vocabulary token
vocab_size = 10000  # you can adjust based on your data
oov_token = "<OOV>"

# Use the pre-cleaned text when both splits provide it; otherwise fall back to
# the raw "abstract" column. The column listing printed after loading shows no
# "clean_abstract" column, so indexing it unconditionally raises KeyError.
has_clean_text = all("clean_abstract" in frame.columns for frame in (train_df, test_df))
text_column = "clean_abstract" if has_clean_text else "abstract"

# pandas reads empty cells as NaN (a float); turn them into empty strings so
# every item handed to the tokenizer is a str.
train_texts = train_df[text_column].fillna("").astype(str)
test_texts = test_df[text_column].fillna("").astype(str)

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)

# Fit the tokenizer on the training abstracts only, so the vocabulary is not
# influenced by the test split
tokenizer.fit_on_texts(train_texts)

# Convert text to sequences of integers
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

c. Padding the Sequences

Choose a maximum sequence length (e.g., based on some percentile of abstract lengths) and pad the sequences so that each input has the same length.

In [None]:
max_length = 200  # tokens kept per abstract; tune to your length distribution

# Pad short sequences and cut long ones at the end ("post") so that both splits
# end up as matrices with max_length columns.
padding_options = {"maxlen": max_length, "padding": "post", "truncating": "post"}
X_train = pad_sequences(train_sequences, **padding_options)
X_test = pad_sequences(test_sequences, **padding_options)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)

2. Converting Labels to a Suitable Format

If your labels are strings (for example, "eess"), convert them to integers with scikit-learn’s LabelEncoder, and then to one-hot vectors if needed.
a. Encode String Labels as Integers

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the string -> integer mapping from the training labels only, then apply
# that same mapping to both splits.
le = LabelEncoder().fit(train_df["label"])
y_train_encoded = le.transform(train_df["label"])
y_test_encoded = le.transform(test_df["label"])

# Optional: show which integer each original label was assigned
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label mapping:", label_mapping)


b. Convert to One-Hot Encoding (for multi-class classification)

If you’re using a categorical crossentropy loss, convert your integer labels to one-hot vectors.

In [None]:
import tensorflow as tf
to_categorical = tf.keras.utils.to_categorical

# One column per class known to the fitted encoder
num_classes = len(le.classes_)

# Expand the integer labels of both splits into one-hot rows of width num_classes
y_train_cat, y_test_cat = (
    to_categorical(encoded_labels, num_classes=num_classes)
    for encoded_labels in (y_train_encoded, y_test_encoded)
)


3. Building an RNN-Based Model

Now that we have the padded sequences and the encoded labels, we can build our RNN-based classifier. In this step, we’ll demonstrate a simple model using an LSTM layer. The LSTM will process the sequence and return its last hidden state as a fixed-size representation, which is then fed into dense layers to do the classification.
a. Example 1: Using a Single LSTM

In [None]:
import tensorflow as tf

Sequential = tf.keras.models.Sequential
Dense = tf.keras.layers.Dense
LSTM = tf.keras.layers.LSTM
Embedding = tf.keras.layers.Embedding
Dropout = tf.keras.layers.Dropout

embedding_dim = 100  # you can adjust (or use a pretrained embedding matrix if desired)

model = Sequential()
# Declare the input shape explicitly so the model is built before summary();
# this replaces Embedding's `input_length` argument, which newer Keras
# releases deprecate.
model.add(tf.keras.Input(shape=(max_length,)))
# Embedding layer: learns an embedding for each word.
# mask_zero=True marks index 0 (the value the sequences were padded with) as
# padding, so the LSTM skips those steps. Without it, post-padded inputs make
# the "final" LSTM state the state after reading the trailing padding.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True))
# LSTM layer; note that it returns only the final state by default (return_sequences=False)
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# Optional additional dense layer
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
# Final output layer; using softmax activation for multi-class classification
model.add(Dense(num_classes, activation='softmax'))

# categorical_crossentropy matches the one-hot encoded targets
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


b. Example 2: Using a Bidirectional LSTM

A Bidirectional LSTM processes the sequence in both forward and backward directions. With Keras’ default merge mode ("concat"), the final hidden states of the two directions are concatenated into a single vector; other merge modes (sum, average, multiply) are also available.

In [None]:
import tensorflow as tf


Bidirectional = tf.keras.layers.Bidirectional

model_bi = Sequential()
# Explicit input shape so the model is built before summary(); this replaces
# Embedding's `input_length` argument, which newer Keras releases deprecate.
model_bi.add(tf.keras.Input(shape=(max_length,)))
# mask_zero=True marks index 0 (the padding value) as padding so neither
# direction of the LSTM treats pad positions as real tokens.
model_bi.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True))
# Bidirectional LSTM layer; automatically concatenates forward and backward states
model_bi.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))
model_bi.add(Dense(64, activation='relu'))
model_bi.add(Dropout(0.5))
# Softmax output over the classes, matching the one-hot encoded targets
model_bi.add(Dense(num_classes, activation='softmax'))

model_bi.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_bi.summary()

4. Training the Model

Train the model using the padded sequences and one-hot encoded labels. For demonstration, here’s how you would train the LSTM model:

In [None]:
import numpy as np

epochs = 10   # Adjust depending on your dataset size and overfitting
batch_size = 32
validation_fraction = 0.1  # share of the training data held out for validation

# Validate on a held-out slice of the TRAINING data. Using the test set for
# validation and then evaluating on it again leaks test information into
# choices such as the number of epochs.
# Keras takes the validation_split slice from the end of the arrays before any
# shuffling, so shuffle once with a fixed seed to avoid an order-dependent split.
shuffled_idx = np.random.default_rng(42).permutation(len(X_train))

history = model.fit(
    X_train[shuffled_idx],
    y_train_cat[shuffled_idx],
    epochs=epochs,
    batch_size=batch_size,
    validation_split=validation_fraction,
)


You can similarly train the bidirectional model by replacing model with model_bi.

5. Evaluating the Model and Extracting the Fixed-Size Representation

After training, evaluate the model’s performance on the test set. You can also predict on the test set and use the final layer outputs to understand how well your RNN captured the representations.
a. Evaluate Performance

In [None]:
# evaluate() yields the loss followed by the metrics given to compile()
# (accuracy here), computed on the test split.
test_metrics = model.evaluate(X_test, y_test_cat)
loss, accuracy = test_metrics
print("Test Accuracy:", accuracy)

b. Making Predictions and Comparing Different Architectures

In [None]:
import numpy as np
from sklearn.metrics import classification_report

# Obtain class probabilities, then pick the most probable class per sample
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# Compare to integer encoded true labels.
# Passing `labels` and `target_names` from the fitted encoder prints the
# original label strings instead of bare integers and keeps one row per known
# class even when a class is absent from the test labels or predictions.
# zero_division=0 reports 0 for such classes without emitting warnings.
class_ids = np.arange(len(le.classes_))
class_names = [str(name) for name in le.classes_]
print(
    classification_report(
        y_test_encoded,
        y_pred,
        labels=class_ids,
        target_names=class_names,
        zero_division=0,
    )
)


6. Summary and Next Steps

    Tokenization & Padding: Use Keras’ Tokenizer to convert text to sequences and pad them to a fixed length.

    Label Encoding: Convert string labels to integers (and then to one-hot vectors if needed).

    RNN-Based Model:

        Use an Embedding layer followed by an LSTM (or GRU) layer.

        For a fixed-size representation, use the final hidden state of the RNN (or use a Bidirectional RNN which concatenates states).

    Training and Evaluation: Train your model and evaluate using accuracy and detailed classification reports.

    Experiment: Switch between LSTM, GRU, and Bidirectional RNNs, and adjust hyperparameters (layer sizes, dropout rates) to see which architecture performs best on your dataset.

This modular approach lets you easily compare the different RNN architectures and pooling strategies for extracting document-level representations, allowing you to select the best model for your classification task.
