In [None]:
from transformers import AutoModelForSeq2SeqLM
from peft import LoraConfig, TaskType, get_peft_model

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from peft import LoraConfig, TaskType, get_peft_model

import torch
from sklearn.metrics import accuracy_score
from transformers import DataCollatorWithPadding

from datasets import load_from_disk, concatenate_datasets, Dataset
from data_utils import compile_DatasetFrames

## Preliminary Analysis:
- $n \approx 15{,}000$ (slightly low)
- $N_{str} \leq 1{,}600$ (low-ish maximum number of characters considered)
- embeddings (if used) from `gist` (moderately sized model)

### Load dataset

In [None]:
# Build the pandas frames for every split in a single call
df_train, df_test, df_val = compile_DatasetFrames()

# Wrap the train and validation frames as `datasets.Dataset` objects.
# The test frame (`df_test`) is intentionally left unconverted for now.
dset_train, dset_val = (
    Dataset.from_pandas(frame) for frame in (df_train, df_val)
)

### Model: `peft`

In [None]:
# LoRA hyper-parameters for a sequence-to-sequence language model
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

# Fetch the pretrained seq2seq checkpoint.
# NOTE(review): `model` is rebound to the BERT sequence classifier in a later
# cell, so this checkpoint does not appear to be used by the training run --
# confirm whether this cell is still needed.
model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/mt0-large")

In [None]:
# Wrap the loaded base model with the LoRA adapters described by `peft_config`.
# NOTE: this rebinds `model` to the wrapped model, so re-running this cell
# without re-running the load cell would wrap an already-wrapped model.
model = get_peft_model(model, peft_config)
# Report how many parameters are trainable after wrapping
model.print_trainable_parameters()

In [None]:
# Step 1: Tokenizer for the checkpoint named by `model_name`
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Truncation is enabled; no padding is requested here.

    Args:
        examples: Mapping with a ``'text'`` entry holding the raw strings.

    Returns:
        The tokenizer output for those strings.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True)

# Step 2: Tokenize each split, keeping only the tokenizer outputs and the label

# Columns that are not model inputs and must not reach the Trainer
UNUSED_COLUMNS = ['text', 'embeddings', 'path', 'abstract', 'firstpage', '__index_level_0__']


def encode_split(dset):
    """Tokenize one dataset split and prepare its columns for training.

    Applies ``preprocess_function`` in batched mode, drops the columns listed
    in ``UNUSED_COLUMNS`` and renames ``journal_cls`` to ``labels``.

    Args:
        dset: The split to encode (a ``datasets.Dataset``).

    Returns:
        The encoded split with the label stored under ``labels``.
    """
    encoded = dset.map(preprocess_function, batched=True)
    # `remove_columns` raises when asked to drop a column that does not exist,
    # and `Dataset.from_pandas` only creates `__index_level_0__` when the frame
    # has a non-default index -- so drop only the columns actually present.
    present = [col for col in UNUSED_COLUMNS if col in encoded.column_names]
    encoded = encoded.remove_columns(present)
    return encoded.rename_column('journal_cls', 'labels')


dset_train_encoded = encode_split(dset_train)
dset_val_encoded = encode_split(dset_val)

In [None]:
# Have both encoded splits hand back PyTorch tensors
for encoded_split in (dset_train_encoded, dset_val_encoded):
    encoded_split.set_format("torch")

# Step 3: Base classifier with one output per distinct class in the training split
unique_classes = set(dset_train['journal_cls'])
num_labels = len(unique_classes)
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

# Step 4: LoRA settings for a sequence-classification task
lora_settings = dict(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=8,  # rank of the low-rank matrices
    lora_alpha=32,
    lora_dropout=0.1,
)
lora_config = LoraConfig(**lora_settings)

# Step 5: Wrap the base classifier with the LoRA adapters
model = get_peft_model(base_model, lora_config)

# Collator used for dynamic padding of each batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Step 6: Define metrics
def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``Trainer`` supplies NumPy
            arrays here; ``torch`` tensors are accepted as well.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is a plain Python float,
        the fraction of rows whose arg-max logit equals the label
        (``0.0`` when there are no rows).
    """
    logits, labels = eval_pred
    # NOTE(review): if the model returns several outputs, predictions arrive as
    # a tuple; the first entry is assumed to be the logits -- confirm per model.
    if isinstance(logits, tuple):
        logits = logits[0]

    # `torch.argmax` rejects NumPy arrays, so normalise both inputs to tensors
    # first (`as_tensor` leaves existing tensors untouched).
    logits = torch.as_tensor(logits)
    labels = torch.as_tensor(labels).reshape(-1)
    predictions = logits.argmax(dim=-1).reshape(-1)

    total = labels.numel()
    if total == 0:
        return {"accuracy": 0.0}
    correct = int((predictions == labels).sum())
    return {"accuracy": correct / total}

# Step 7: Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): newer `transformers` releases rename this argument to
    # `eval_strategy` -- switch if the installed version warns about it.
    evaluation_strategy="epoch",
    # `load_best_model_at_end=True` requires checkpoints to be saved on the
    # same schedule as evaluation; the default step-based saving would make
    # `TrainingArguments` raise a ValueError.
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
)

# Step 8: Initialize Trainer with the LoRA-wrapped model, both encoded splits,
# the padding collator and the accuracy metric
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dset_train_encoded,
    eval_dataset=dset_val_encoded,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Step 9: Train the model with LoRA applied
trainer.train()