In [3]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
import transformers

In [4]:
# Assuming your filtered dataset CSV (from pross_date.py) is saved here:
data_path = "../data/quantum_subset.csv"
df = pd.read_csv(data_path)

# Keep only the necessary columns and drop rows where either one is missing:
# a missing title would reach the tokenizer as a float NaN instead of a string,
# and a missing category cannot be given a meaningful class id.
# .copy() detaches the subset from the frame it was sliced from, so the column
# assignment below does not raise pandas' SettingWithCopyWarning.
df = df[['title', 'categories']].dropna().copy()

# Encode categories: each distinct `categories` string becomes one integer class id
# (the id is the position of the string in `le.classes_`).
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['lable'] = le.fit_transform(df['categories'])  # column name spelling (sic) kept unchanged

# Split: 80 % train / 20 % validation, with a fixed seed so the split is reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['title'].tolist(), df['lable'].tolist(), test_size=0.2, random_state=42
)


In [5]:
# Load the pretrained uncased BERT tokenizer, then encode both splits with the
# exact same settings: truncate to at most 64 tokens and pad within each call.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_encodings, val_encodings = (
    tokenizer(texts, truncation=True, padding=True, max_length=64)
    for texts in (train_texts, val_texts)
)


In [6]:
class ArxivDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    `encodings` is a mapping from field name to a per-example sequence
    (anything exposing `.items()` whose values can be indexed by example);
    `labels` is a sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        """Store the encodings and labels as given; nothing is copied or converted."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors, with the label under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Added last, so 'labels' follows every encoding field in the dict.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample
    
# Wrap each split's encodings and labels so the Trainer can index them by example.
train_dataset, val_dataset = (
    ArxivDataset(train_encodings, train_labels),
    ArxivDataset(val_encodings, val_labels),
)


In [9]:
# Record the class-id <-> category-string mapping in the model config. Without it
# the mapping exists only in the in-memory LabelEncoder, and a checkpoint reloaded
# later could not translate predicted ids back into arXiv categories.
id2label = {class_id: str(category) for class_id, category in enumerate(le.classes_)}
label2id = {category: class_id for class_id, category in id2label.items()}

# One output logit per distinct category string; the classification head is
# freshly initialised (see the warning in the cell output) and learned here.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(le.classes_),
    id2label=id2label,
    label2id=label2id,
)

training_args = TrainingArguments(
    output_dir="../models/category_classifier/",

    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    save_strategy="epoch",  # write a checkpoint under output_dir after every epoch
    logging_dir="../evaluation/",

)
# NOTE(review): eval_dataset is passed in, but no evaluation schedule is configured
# above and the recorded run reports training loss only — consider enabling
# per-epoch evaluation (and a compute_metrics function) to track validation quality.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,6.1862
1000,5.6091
1500,4.9827
2000,4.8449




TrainOutput(global_step=2000, training_loss=5.405738525390625, metrics={'train_runtime': 4206.8363, 'train_samples_per_second': 3.803, 'train_steps_per_second': 0.475, 'total_flos': 536753547264000.0, 'train_loss': 5.405738525390625, 'epoch': 2.0})

In [10]:


# Persist the fine-tuned weights/config and the tokenizer files into the same
# directory, so the classifier can be reloaded from that single path.
model.save_pretrained(save_directory="../models/category_classifier/")
# Last expression of the cell: its return value (the written file paths) is displayed.
tokenizer.save_pretrained(save_directory="../models/category_classifier/")

('../models/category_classifier/tokenizer_config.json',
 '../models/category_classifier/special_tokens_map.json',
 '../models/category_classifier/vocab.txt',
 '../models/category_classifier/added_tokens.json')