# This notebook is used for quick prototyping and for testing parameters

In [2]:
from torch.nn.utils.rnn import pad_sequence
import torch

### Dummy data

In [3]:
# Dummy corpus: each pair is (sentence, simulated category id), e.g. 0 ~ CS, 1 ~ Physics.
# zip() transposes the pairs into one tuple of sentences and one tuple of ids,
# which are then converted to plain lists.
texts, labels = map(list, zip(
    ("This paper explores deep reinforcement learning applications in robotics.", 0),
    ("We investigate new quantum algorithms for physics simulations.", 1),
    ("This study presents a new architecture for convolutional neural networks.", 0),
))

### Simulated vocab for testing (word -> index)

In [4]:
# Word -> index lookup used for testing. Indices are assigned consecutively
# starting at 1, in the order the words are listed; index 0 is never assigned
# to a word here.
vocab = {
    word: index
    for index, word in enumerate(
        [
            "this", "paper", "explores", "deep", "reinforcement", "learning",
            "applications", "in", "robotics", "we", "investigate", "new",
            "quantum", "algorithms", "for", "physics", "simulations",
        ],
        start=1,
    )
}

### Simple RNN preprocessing

In [None]:
def simple_tokenizer(text, word_to_index=None):
    """Convert a sentence into a list of vocabulary indices.

    The text is split on whitespace; each piece is stripped of leading and
    trailing punctuation and lower-cased before being looked up. Without the
    stripping step a sentence-final word such as "robotics." would never match
    the entry "robotics" and would silently be encoded as unknown.

    Args:
        text: The raw sentence to encode.
        word_to_index: Optional mapping from lower-case word to integer index.
            When omitted, the notebook-level ``vocab`` dictionary is used.

    Returns:
        A list with one integer per whitespace-separated piece of ``text``.
        Pieces that are not in the mapping are encoded as 0.
    """
    import string  # function-scope import keeps this cell runnable on its own

    if word_to_index is None:
        word_to_index = vocab

    # Only surrounding punctuation is removed, so inner characters such as the
    # hyphens in "state-of-the-art" are left untouched.
    cleaned = (piece.strip(string.punctuation).lower() for piece in text.split())
    return [word_to_index.get(token, 0) for token in cleaned]

# Encode every text as a 1-D tensor of vocabulary indices. The dtype is pinned
# to long because torch.tensor([]) would otherwise infer float32 for a text
# that produces no tokens, giving a float tensor among integer ones.
tokenized = [torch.tensor(simple_tokenizer(t), dtype=torch.long) for t in texts]

# Right-pad all sequences with 0 up to the length of the longest one; with
# batch_first=True the result has shape (number of texts, longest length).
# Note that 0 is also the index simple_tokenizer returns for unknown words.
padded = pad_sequence(tokenized, batch_first=True, padding_value=0)

# One integer class id per text, in the same order as `texts`.
labels_tensor = torch.tensor(labels, dtype=torch.long)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the complete dataset. The path is relative, so it is resolved against
# the current working directory.
df = pd.read_csv("../../Data/full_arxiv.csv")

# Separate the target column from the remaining columns.
y = df['label']
X = df.drop(columns=['label'])

# Hold out 20% of the rows for validation. Stratifying on the labels keeps the
# class proportions the same in both parts; the fixed seed makes the split
# repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Re-attach the labels as the last column of each part and write it to disk
# without the index column. `.values` is used so the labels are attached by
# position rather than aligned on the index.
train_df = X_train.assign(label=y_train.values)
train_df.to_csv("arxiv_train2.csv", index=False)

val_df = X_val.assign(label=y_val.values)
val_df.to_csv("arxiv_val.csv", index=False)
