# Final Notebook

Contributors:

Agustin Leon Nunez (al8937)

Akhil Manoj (am14580)

Anup Raj Niroula (arn8147)

Install and import required libraries

In [None]:
!pip install transformers datasets evaluate accelerate peft trl bitsandbytes
!pip install nvidia-ml-py3
!pip install scikit-learn
!pip install torch
!pip install numpy
!pip install tensorboard

In [None]:
import os
import pandas as pd
import torch
from transformers import RobertaModel, RobertaTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, ClassLabel, concatenate_datasets
import pickle
import numpy

In [None]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

torch.cuda.empty_cache()


# Basic Configuration Initialization for various components of the project

Here, the configuration for `LoraConfig`, `TrainingArgument` and whether to set the Data Augmentation Flags for the rest of the experiment.

In [None]:
# Step 0: Parameters

# Checkpointing
CHECKPOINT_DIR = "checkpoints"
START_FROM = "best"  # Options: "last", "best", or "none"

## Deprecated
# Fine-tuning GPT2 on target domain so it learns test-time features.
ALTERNATIVE_CONFIG = False
## Deprecated


#-------------------------------------------------------
# Data Pre-processing and Augmentation Configuration
#-------------------------------------------------------

# Data Augmentation

# Tokenization & Data
VALIDATION_SIZE = 0.1
SEED = 42
NUM_WORKERS = 4

# Data Filtering
USE_DATA_FILTERING = True
DATA_FILTER_MAX_LENGTH = 512
DATA_FILTER_MIN_LENGTH = 1

#Data Augmentation
USE_AUGMENTED_SAMPLES = True

# Model Configuration

## Deprecated
# Fine-tuning GPT2 on target domain so it learns test-time features.
#Knowledge Distillation
TRAIN_TEACHER_KD = False      #Train the Teacher
TRANSFER_LEARNING_KD = False # Once Teacher and Student are ready, set to True to transfer learning
## Deprecated


# Domain-adaptive LLM Data Augmentation of AGNEws
USE_LLM_DOMAIN_AUGMENTATION = False  # Toggle ON/OFF
LLM_MODEL_NAME = "gpt2"              # Can be "gpt2", "EleutherAI/gpt-neo-125M", etc.
NUM_LLM_GENERATIONS_PER_CLASS = 3000  # Samples to generate per AGNews class
APPLY_LLM_DOMAIN_AUG = True # Set to true when you want to implement the augmentation
# ADAPTED_LLM_PATH = "adapted_llm"
ADAPTED_LLM_PATH = "./adapted_llm"

# Loading Pre-trained model for continued training (if needed)
LOAD_FROM_PRETRAINED = False
PRETRAINED_MODEL = "<model-path>"

#-------------------------------------------------------
# Model Hyperparameters
#-------------------------------------------------------
MODEL_NAME = "roberta-base"

# ModelTraining Parameters
BATCH_SIZE = 64
EPOCHS = 10
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.1

OPTIMIZER = "adamw_torch_fused"
# MAX_STEPS = 1200
EVAL_STRATEGY = "epoch"
LOGGING_STEPS = 100
GRADIENT_CHECKPOINTING = True
GRADIENT_ACCUM_STEPS = 1

# LORA Configuration
LORA_RANK = 4
LORA_ALPHA = 64
LORA_DROPOUT = 0.1
LORA_TARGET_MODULES = ['query', 'key', 'value']
LORA_BIAS = 'all'
OUTPUT_DIR = "./results"

# Augmented Dataset Specification
AUGMENTED_DATASET_PATH = "./aug_llm_data" #Set correct directory and name
AUGMENTED_DATASET_TEXT_COLUMN = "text" #Set to proper name
AUGMENTED_DATASET_LABEL_COLUMN = "label" #Set to proper name

# Data Augmentation

## Domain Adaptation of the AG-NEWS dataset
To get the distribution of the unlabelled dataset(target dataset) in the AG-NEWS dataset(source dataset), we opted to implement target-aware data augmentation. 
To achieve this, we fine-tuned an LLM model (GPT2) on target dataset and use that to generate synthetic training examples in the target dataset style

In [None]:
if USE_LLM_DOMAIN_AUGMENTATION:
    import pandas as pd
    from transformers import AutoTokenizer
    from datasets import Dataset

    # Load unlabeled data from CSV
    unlabelled_df = pd.read_csv("./test_unlabelled.csv")  # Update path as needed

    # Convert to Hugging Face Dataset
    unlabelled_dataset = Dataset.from_pandas(unlabelled_df)

    # Load tokenizer and set pad token
    llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
    if llm_tokenizer.pad_token is None:
        llm_tokenizer.pad_token = llm_tokenizer.eos_token

    # Tokenization function
    def tokenize_for_lm(example):
        return llm_tokenizer(
            example["text"],
            truncation=True,
            padding="max_length",
            max_length=DATA_FILTER_MAX_LENGTH
        )

    # Tokenize the dataset
    tokenized_unlabelled = unlabelled_dataset.map(tokenize_for_lm, batched=True)

#### Training LLM model for domain adaptatation

In [None]:
if USE_LLM_DOMAIN_AUGMENTATION:
    from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling

    # Load model
    lm_model = AutoModelForCausalLM.from_pretrained(LLM_MODEL_NAME)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=ADAPTED_LLM_PATH,
        per_device_train_batch_size=4,
        num_train_epochs=3,
        logging_steps=50,
        save_steps=500,
        save_total_limit=2,
        fp16=True,
        overwrite_output_dir=True,
    )

    # Initialize Trainer
    trainer = Trainer(
        model=lm_model,
        args=training_args,
        train_dataset=tokenized_unlabelled,
        data_collator=DataCollatorForLanguageModeling(llm_tokenizer, mlm=False)
    )

    # Train and Save
    trainer.train()
    trainer.save_model(ADAPTED_LLM_PATH)
    llm_tokenizer.save_pretrained(ADAPTED_LLM_PATH)

## Data Augmentation

#### Load Tokenizer and Preprocess Data

In [4]:
# Step 1: Load Tokenizer and Preprocess Data

base_model = MODEL_NAME

dataset = load_dataset('ag_news', split='train')
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)

In [None]:
from transformers import pipeline, set_seed
from datasets import Dataset
import random
import numpy as np
import time

if USE_LLM_DOMAIN_AUGMENTATION:
    print("Performing LLM-based domain augmentation...")

    # Generation Parameters
    NUM_LLM_GENERATIONS_PER_CLASS = 3000
    BATCH_SIZE = 50
    MAX_LENGTH = 128
    REPETITION_PENALTY = 1.3  # Increase to discourage repetitive output

    set_seed(42)

    # Load the LLM pipeline
    generator = pipeline("text-generation", model=ADAPTED_LLM_PATH, device=0)

    # Diverse prompts for each AGNews class
    AGNEWS_PROMPTS = {
        0: [
            "Write a short article about global affairs:",
            "Describe a recent international political event:",
            "Report on a global crisis:",
            "Summarize a UN meeting:",
            "Headline news from Europe:",
            "Describe a geopolitical conflict:",
            "World headline story:",
            "Write a short report about international tensions:",
            "What is happening globally today?",
            "Write a short update on international relations:",
            "Write a brief article about world diplomacy:",
            "Cover a story from Asia or Africa:",
            "Cover a story from Middle East or Western Asia:",
            "What is the latest in foreign policy?",
            "Write a short international incident report:",
            "Describe a breaking world news story:"
        ],
        1: [
            "Write a short sports update:",
            "Describe a recent football match:",
            "Write a short NBA game report:",
            "Report on a tennis tournament:",
            "Write about a famous athlete's performance:",
            "Describe a recent soccer win:",
            "Write a short piece about a sports upset:",
            "What happened in sports this week?",
            "Summarize a recent game:",
            "Short sports commentary:",
            "Who won the match today?",
            "Write a sports new based in Africa",
            "Write a sports new based in Southeast Asia",
            "Write a sports new based in the US",
            "Write a sports new based in South America",
            "Describe a record-breaking sports moment:",
            "Write a recap of a boxing match:",
            "Short news on Olympic sports:",
            "Write a short report on a sports scandal:"
        ],
        2: [
            "Write a short business article:",
            "Describe a stock market change:",
            "Report on a major company merger:",
            "Summarize an economic trend:",
            "What is happening in finance today?",
            "Write a short article about corporate news:",
            "Describe a recent economic development:",
            "Write about business earnings reports:",
            "Cover a tech startup acquisition:",
            "Write about inflation or interest rate news:",
            "Describe a company going public:",
            "Summarize a business scandal:",
            "Write a headline about Wall Street:",
            "Write about a global economic issue:",
            "Write about a business article on the sports industry:",
            "Write about a business article on the sports industry:",
            "Write about a business article on the sports industry:",
            "Short report on trade and markets:"
        ],
        3: [
            "Write a short tech article:",
            "Describe a recent space discovery:",
            "What's new in artificial intelligence?",
            "Write a short article about scientific progress:",
            "Summarize a new research paper:",
            "Write about a medical breakthrough:",
            "Describe a new tech gadget:",
            "Write about a NASA update:",
            "Report on an academic conference:",
            "Cover a story about climate science:",
            "Write about electric vehicles innovation:",
            "Describe a recent invention:",
            "Summarize a finding in biology:",
            "What's trending in technology?",
            "Write a tech or science article with business related words",
            "Write a tech or science article with business related words",
            "Write a tech article from someone outside of the US",
            "Write about computer science progress:"
        ]
    }

    # Word-length based sampling for variable output length
    def sample_max_length():
        # Draw from a normal distribution centered around 45, std dev 12
        sampled = int(np.random.normal(loc=45, scale=12))
        # Clamp values to ensure theyâ€™re between 1 and 128
        return max(1, min(MAX_LENGTH, sampled))

    synthetic_samples = []

    for label, prompt_list in AGNEWS_PROMPTS.items():
        print(f"\n Generating {NUM_LLM_GENERATIONS_PER_CLASS} samples for label {label}...")

        for i in range(0, NUM_LLM_GENERATIONS_PER_CLASS, BATCH_SIZE):
            current_batch_size = min(BATCH_SIZE, NUM_LLM_GENERATIONS_PER_CLASS - i)
            prompts = random.choices(prompt_list, k=current_batch_size)
            max_len = sample_max_length()

            start = time.time()
            outputs = generator(
                prompts,
                max_length=max_len,
                do_sample=True,
                top_k=50,
                top_p=0.95,
                truncation=True,
                repetition_penalty=REPETITION_PENALTY
            )

            # Remove the prompt from each output
            for prompt, generated_list in zip(prompts, outputs):
                for out in generated_list:  # iterate over outputs per prompt
                    text = out["generated_text"]
                    cleaned = text.replace(prompt, "").strip()
                    synthetic_samples.append({"text": cleaned, "label": label})

            print(f"   Batch {i + current_batch_size}/{NUM_LLM_GENERATIONS_PER_CLASS} in {time.time() - start:.2f}s")

    # Save as Hugging Face dataset
    aug_llm_dataset = Dataset.from_list(synthetic_samples)
    aug_llm_dataset = aug_llm_dataset.cast_column("label", dataset.features["label"])
    aug_llm_dataset.save_to_disk("./aug_llm_data")

    print(" Saved to disk at: aug_llm_dataset_cleaned")

##### Exploring the Generated Synthetic Data 

In [None]:
# Explore and check the generated augmented samples. We will print 5 of each class.

import random

# AGNews label names (optional, for nicer output)
label_names = {
    0: "World",
    1: "Sports",
    2: "Business",
    3: "Sci/Tech"
}

# Make sure the dataset is shuffled first (if not already)
aug_llm_dataset = aug_llm_dataset.shuffle(seed=42)

# For each label (0 to 3), print 5 samples
for label in range(4):
    print(f"\nðŸ“° Label {label}: {label_names[label]}")
    samples = [ex["text"] for ex in aug_llm_dataset if ex["label"] == label]
    random_samples = random.sample(samples, k=min(5, len(samples)))  # just in case <5 exist
    for i, sample in enumerate(random_samples, 1):
        print(f"\nSample {i}:\n{sample.strip()}")

#### Combine the synthetic dataset with AG_NEWS dataset

In [7]:
if APPLY_LLM_DOMAIN_AUG:

  from datasets import load_from_disk, concatenate_datasets

  # Load AGNews
  from datasets import load_dataset
  agnews_dataset = load_dataset("ag_news", split="train")

  # Load your saved generated data
  aug_llm_dataset = load_from_disk("./aug_llm_data")

  # Concatenate
  combined_dataset = concatenate_datasets([agnews_dataset, aug_llm_dataset])
  dataset = combined_dataset.shuffle(seed=42)

# Model Training

## Model and Data Tokenizer Initialization

In [None]:
def filter_dataset(dataset, min_len=DATA_FILTER_MIN_LENGTH, max_len=DATA_FILTER_MAX_LENGTH):
    def is_valid(example):
        length = len(example["text"].split())
        return min_len <= length <= max_len
    return dataset.filter(is_valid)


# Optional: if we are using data filtering
if USE_DATA_FILTERING:
    print("Filtering dataset based on text length...")
    dataset = filter_dataset(dataset, max_len=DATA_FILTER_MAX_LENGTH)


def preprocess(examples):
    tokenized = tokenizer(examples['text'], truncation=True, padding=True)
    return tokenized

tokenized_dataset = dataset.map(preprocess, batched=True,  remove_columns=["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")


# Extract the number of classess and their names
num_labels = dataset.features['label'].num_classes
class_names = dataset.features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create an id2label mapping
# We will need this for our classifier.
id2label = {i: label for i, label in enumerate(class_names)}

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


# Load Pre-trained Model

model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels, id2label=id2label)

model


Filtering dataset based on text length...


Filter:   0%|          | 0/132000 [00:00<?, ? examples/s]

Map:   0%|          | 0/132000 [00:00<?, ? examples/s]

number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

## Data Splitting

In [None]:
# Split the original training set
split_datasets = tokenized_dataset.train_test_split(test_size=VALIDATION_SIZE, seed=SEED)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

## Setup LoRA Config
Setup PEFT config and get peft model for finetuning

In [None]:
# Step 2: Setup LoRA Config

# PEFT Config
# PEFT Config
peft_config = LoraConfig(
    r=LORA_RANK,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias=LORA_BIAS,
    target_modules=LORA_TARGET_MODULES,
    task_type="SEQ_CLS",
)

peft_model = get_peft_model(model, peft_config)

print('PEFT Model')
peft_model.print_trainable_parameters()

PEFT Model
trainable params: 916,996 || all params: 125,463,560 || trainable%: 0.7309


## Training Setup

In [None]:
# Step 3: Training setup

# To track evaluation accuracy during training
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)
    return {
        'accuracy': accuracy
    }


# Setup training args

print("TrainingArguments comes from:", TrainingArguments.__module__)

# Setup Training args
output_dir = "results"
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=12,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE*4,
    learning_rate=LEARNING_RATE,
    eval_strategy=EVAL_STRATEGY,               # Evaluate every epoch
    save_strategy="epoch",                     # Save checkpoint every epoch
    load_best_model_at_end=True,               # Load best checkpoint automatically
    metric_for_best_model="accuracy",          # Use accuracy to determine best model
    greater_is_better=True,                    # Higher accuracy is better
    save_total_limit=2,                        # Keep only last 2 checkpoints
    logging_dir="./logs",                      # For log storage
    logging_steps=LOGGING_STEPS,
    report_to="tensorboard",
    optim="adamw_torch_fused",
    gradient_checkpointing=GRADIENT_CHECKPOINTING,
    gradient_checkpointing_kwargs={"use_reentrant": True},
    dataloader_num_workers=NUM_WORKERS,
    gradient_accumulation_steps= GRADIENT_ACCUM_STEPS,
    fp16 = True,
    warmup_ratio = WARMUP_RATIO,
    weight_decay= WEIGHT_DECAY
)

def get_trainer(model):
      return Trainer(
          model=model,
          args=training_args,
          compute_metrics=compute_metrics,
          train_dataset=train_dataset,
          eval_dataset=eval_dataset,
          data_collator=data_collator
      )


TrainingArguments comes from: transformers.training_args


## Evaluate Finetuned Model


In [15]:
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

def evaluate_model(inference_model, dataset, labelled=True, batch_size=8, data_collator=None):
    """
    Evaluate a PEFT model on a dataset.

    Args:
        inference_model: The model to evaluate.
        dataset: The dataset (Hugging Face Dataset) to run inference on.
        labelled (bool): If True, the dataset includes labels and metrics will be computed.
                         If False, only predictions will be returned.
        batch_size (int): Batch size for inference.
        data_collator: Function to collate batches. If None, the default collate_fn is used.

    Returns:
        If labelled is True, returns a tuple (metrics, predictions)
        If labelled is False, returns the predictions.
    """
    # Create the DataLoader
    eval_dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inference_model.to(device)
    inference_model.eval()

    all_predictions = []
    if labelled:
        metric = evaluate.load('accuracy')

    # Loop over the DataLoader
    for batch in tqdm(eval_dataloader):
        # Move each tensor in the batch to the device
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = inference_model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        all_predictions.append(predictions.cpu())

        if labelled:
            # Expecting that labels are provided under the "labels" key.
            references = batch["labels"]
            metric.add_batch(
                predictions=predictions.cpu().numpy(),
                references=references.cpu().numpy()
            )

    # Concatenate predictions from all batches
    all_predictions = torch.cat(all_predictions, dim=0)

    if labelled:
        eval_metric = metric.compute()
        print("Evaluation Metric:", eval_metric)
        return eval_metric, all_predictions
    else:
        return all_predictions

### Training the model
For the best model, we ran the training for 11 epochs in 2 parts
- First for 8 epochs
- Second for 3 epochs

In [None]:
trainer = get_trainer(peft_model)  # Make sure get_trainer(...) uses correct TrainingArguments

import time

# Train and evaluate using Hugging Face
for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch + 1}/{EPOCHS}")
    start_time = time.time()

    trainer.train(resume_from_checkpoint=None)  # or pass checkpoint path if needed
    trainer.save_model(MODEL_NAME + "_model")
    trainer.save_state()  # optional: saves optimizer/scheduler/training state

    epoch_time = time.time() - start_time
    print(f"Epoch time: {epoch_time:.2f} seconds")

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.



Epoch 1/10


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2072,0.210628,0.927045
2,0.2126,0.204843,0.928409
3,0.2047,0.206081,0.928333


KeyboardInterrupt: 

### Performing Inference on Custom Input
Uncomment following functions for running inference on custom inputs

In [None]:
# def classify(model, tokenizer, text):
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)
#     output = model(**inputs)

#     prediction = output.logits.argmax(dim=-1).item()

#     print(f'\n Class: {prediction}, Label: {id2label[prediction]}, Text: {text}')
#     return id2label[prediction]

In [None]:
# classify( peft_model, tokenizer, "Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...")
# classify( peft_model, tokenizer, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.")

### Run Inference on eval_dataset

In [None]:
# Check evaluation accuracy
# _, _ = evaluate_model(peft_model, eval_dataset, True, 8, data_collator)

### Run Inference on unlabelled dataset

In [20]:
#Load your unlabelled data
def preprocess_inference(examples):
    return tokenizer(
        examples['text'],
        truncation=True,
        padding="max_length",  # ensure uniform input size
        max_length=512         # match training size
    )


unlabelled_dataset = pd.read_pickle("test_unlabelled.pkl")
test_dataset = unlabelled_dataset.map(preprocess_inference, batched=True, remove_columns=["text"])
unlabelled_dataset

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 8000
})

In [23]:
# Run inference and save predictions
preds = evaluate_model(peft_model, test_dataset, False, 8, data_collator)
df_output = pd.DataFrame({
    'ID': range(len(preds)),
    'Label': preds.numpy()  # or preds.tolist()
})
df_output.to_csv(os.path.join(output_dir,"inference_output.csv"), index=False)
print("Inference complete. Predictions saved to inference_output.csv")

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1000/1000 [00:28<00:00, 34.65it/s]

Inference complete. Predictions saved to inference_output.csv





### Saving the Model at last

In [24]:
# Saving the model and tokenizer
peft_model.save_pretrained("./best-model")
tokenizer.save_pretrained("./best-model")

('./best-model/tokenizer_config.json',
 './best-model/special_tokens_map.json',
 './best-model/vocab.json',
 './best-model/merges.txt',
 './best-model/added_tokens.json')