In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import os

In [2]:
train_df = pd.read_csv('arxiv_train.csv')
test_df = pd.read_csv('arxiv_test.csv')

In [3]:
train_df.dropna(subset=['abstract', 'label'], inplace=True)
test_df.dropna(subset=['abstract', 'label'], inplace=True)

In [None]:
import re

def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    text = text.lower()
    return text

train_df['clean_abstract'] = train_df['abstract'].apply(clean_text)
test_df['clean_abstract'] = test_df['abstract'].apply(clean_text)


In [5]:
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['label'])
y_test = label_encoder.transform(test_df['label'])
num_classes = len(label_encoder.classes_)
print(f"Number of classes: {num_classes}")
print(f"Classes: {label_encoder.classes_}")

Number of classes: 10
Classes: ['astro-ph' 'cond-mat' 'cs' 'eess' 'hep-ph' 'hep-th' 'math' 'physics'
 'quant-ph' 'stat']


In [6]:
glove_path = 'glove.6B.100d.txt'
if not os.path.exists(glove_path):
    # If file doesn't exist, provide instructions to download
    print("Please download GloVe embeddings from: https://nlp.stanford.edu/data/glove.6B.zip")
    print("Extract and place glove.6B.100d.txt in your working directory")
    # Alternatively, you could automate this with requests or wget
else:
    print("GloVe embeddings found.")

GloVe embeddings found.


In [7]:
def load_glove_embeddings(path):
    """Load GloVe embeddings from file."""
    print("Loading GloVe embeddings...")
    embeddings_index = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print(f"Found {len(embeddings_index)} word vectors.")
    return embeddings_index

In [8]:
def text_to_embedding(text, embeddings_index, embedding_dim=100):
    """Convert text to embedding by averaging word vectors."""
    words = text.lower().split()
    embedding = np.zeros(embedding_dim)
    word_count = 0
    
    for word in words:
        if word in embeddings_index:
            embedding += embeddings_index[word]
            word_count += 1
    
    if word_count > 0:
        embedding /= word_count
    
    return embedding

In [9]:
embeddings_index = load_glove_embeddings(glove_path)

Loading GloVe embeddings...
Found 400000 word vectors.


In [10]:
X_train = np.array([text_to_embedding(abstract, embeddings_index) 
                    for abstract in train_df['clean_abstract']])
X_test = np.array([text_to_embedding(abstract, embeddings_index) 
                   for abstract in test_df['clean_abstract']])

print(f"Training feature shape: {X_train.shape}")
print(f"Testing feature shape: {X_test.shape}")

Training feature shape: (80000, 100)
Testing feature shape: (20000, 100)


In [11]:
model = Sequential([
    Dense(256, activation='relu', input_shape=(100,)),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [12]:
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

In [13]:
model.summary()

In [14]:
history = model.fit(
    X_train, y_train,
    validation_split=0.1,
    epochs=10,
    batch_size=32,
    verbose=1
)

Epoch 1/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.4395 - loss: 1.5496 - val_accuracy: 0.6625 - val_loss: 0.9813
Epoch 2/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.6290 - loss: 1.0590 - val_accuracy: 0.6793 - val_loss: 0.9199
Epoch 3/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.6503 - loss: 0.9959 - val_accuracy: 0.6762 - val_loss: 0.9169
Epoch 4/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.6603 - loss: 0.9730 - val_accuracy: 0.6981 - val_loss: 0.8687
Epoch 5/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.6688 - loss: 0.9492 - val_accuracy: 0.6984 - val_loss: 0.8582
Epoch 6/10
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.6749 - loss: 0.9394 - val_accuracy: 0.6967 - val_loss: 0.8595
Epoch 7/10
[1m2

In [15]:
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test accuracy: {test_acc:.4f}")

[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.6994 - loss: 0.8364
Test accuracy: 0.7015


In [16]:
import sklearn.metrics as metrics
y_pred = np.argmax(model.predict(X_test), axis=1)

[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [17]:
# Calculate metrics
print("Classification Report:")
print(metrics.classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Print macro F1 and weighted F1 for comparison with your first step
print(f"Macro F1: {metrics.f1_score(y_test, y_pred, average='macro'):.4f}")
print(f"Weighted F1: {metrics.f1_score(y_test, y_pred, average='weighted'):.4f}")

Classification Report:
              precision    recall  f1-score   support

    astro-ph       0.81      0.87      0.84      2013
    cond-mat       0.65      0.66      0.66      2058
          cs       0.63      0.59      0.61      1995
        eess       0.64      0.72      0.68      1948
      hep-ph       0.84      0.69      0.76      1990
      hep-th       0.65      0.80      0.72      2019
        math       0.75      0.77      0.76      2042
     physics       0.63      0.39      0.48      1977
    quant-ph       0.77      0.74      0.76      2013
        stat       0.65      0.77      0.70      1945

    accuracy                           0.70     20000
   macro avg       0.70      0.70      0.70     20000
weighted avg       0.70      0.70      0.70     20000

Macro F1: 0.6965
Weighted F1: 0.6970
