# Starter Notebook

Install and import required libraries

In [1]:
from kaggle_secrets import UserSecretsClient
import wandb
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("WANDB_API_KEY")
wandb.login(key=secret_value_0)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msohith-bandari[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [2]:
!pip install transformers datasets evaluate accelerate peft trl bitsandbytes -q
!pip install nvidia-ml-py3 -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m336.4/336.4 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for nvidia-ml-py3 (setup.py) ... [?25l[?25hdone


In [3]:
import os
import pandas as pd
import torch
from transformers import RobertaModel, RobertaTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, ClassLabel
import pickle

## Load Tokenizer and Preprocess Data

In [4]:
!python -m spacy download en_core_web_lg
import spacy
from datasets import load_dataset
from transformers import RobertaTokenizer
import re
import string

# Load models
base_model = 'roberta-base'
spacy_model = "en_core_web_lg"  # Make sure this is installed: python -m spacy download en_core_web_trf
nlp = spacy.load(spacy_model)
tokenizer = RobertaTokenizer.from_pretrained(base_model)

# Get all spaCy NER labels and add them as special tokens to the RoBERTa tokenizer
entity_labels = nlp.get_pipe("ner").labels
special_tokens = [f"[{label}]" for label in entity_labels]
num_added_tokens = tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})
print(f"Added {num_added_tokens} special tokens to the tokenizer")

# Load dataset
dataset = load_dataset('ag_news', split='train')

# Define preprocessing functions
def replace_entities_with_labels(text):
    """Replace named entities with their label placeholders"""
    doc = nlp(text)
    result = text
    # Process from end to beginning to avoid index shifting
    for ent in reversed(doc.ents):
        start, end = ent.start_char, ent.end_char
        result = result[:start] + f"[{ent.label_}]" + result[end:]
    return result

def preprocess_text(text):
    """Apply preprocessing while preserving entity placeholders"""
    # Identify placeholders
    placeholders = re.findall(r'\[\w+\]', text)
    placeholder_map = {}
    
    # Replace placeholders with unique temporary markers
    for i, ph in enumerate(placeholders):
        temp_marker = f"PLACEHOLDER_{i}"
        text = text.replace(ph, temp_marker)
        placeholder_map[temp_marker] = ph
    
    # Apply preprocessing to the text without placeholders
    text = text.lower()
    # Remove punctuation except for placeholder markers
    text = ''.join([c for c in text if c not in string.punctuation or c == '_'])
    
    # Replace temporary markers back with original placeholders
    for marker, placeholder in placeholder_map.items():
        text = text.replace(marker, placeholder)
    
    return text

def full_preprocess(examples):
    processed_texts = []
    
    for text in examples['text']:
        # Step 1: Replace entities with their labels
        text_with_ent_labels = replace_entities_with_labels(text)
        
        # Step 2: Apply preprocessing while preserving placeholders
        processed_text = preprocess_text(text_with_ent_labels)
        
        processed_texts.append(processed_text)
    
    # Step 3: Apply RoBERTa tokenizer
    tokenized = tokenizer(processed_texts, truncation=True, padding=True)
    return tokenized

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Added 18 special tokens to the tokenizer


README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [5]:
# # Apply the full preprocessing pipeline
# tokenized_dataset = dataset.map(full_preprocess, batched=True, remove_columns=["text"])
# tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

# # Print some statistics
# print(f"Dataset size: {len(tokenized_dataset)}")
# print(f"Number of features: {len(tokenized_dataset.features)}")
# print(f"Sample features: {list(tokenized_dataset.features.keys())}")

In [6]:
# tokenized_dataset.save_to_disk("/kaggle/working/tokenized_ag_news_trf")

In [7]:
# # Create a zip archive of the saved dataset
# import os
# import zipfile
# import shutil
# from pathlib import Path

# output_dir_zip = "/kaggle/working/tokenized_ag_news_trf"
# zip_filename = "tokenized_ag_news_trf.zip"
# with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
#     # Walk through the directory and add all files to the zip
#     for root, dirs, files in os.walk(output_dir_zip):
#         for file in files:
#             # Calculate path relative to the directory we're zipping
#             filepath = os.path.join(root, file)
#             arcname = os.path.relpath(filepath, os.path.dirname(output_dir_zip))
#             zipf.write(filepath, arcname)

# print(f"Dataset compressed to {zip_filename}")

In [8]:
from datasets import load_from_disk
tokenized_dataset = load_from_disk("/kaggle/input/tokenized-data/tokenized_ag_news_lg/tokenized_ag_news_lg")

In [9]:
# Extract the number of classess and their names
num_labels = dataset.features['label'].num_classes
class_names = dataset.features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create an id2label mapping
# We will need this for our classifier.
id2label = {i: label for i, label in enumerate(class_names)}

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


## Load Pre-trained Model
Set up config for pretrained model and download it from hugging face

In [10]:
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    id2label=id2label)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Anything from here on can be modified

In [11]:
# Split the original training set
tokenized_dataset.save_to_disk("/kaggle/working/tokenized_dataset")
tokenized_dataset = load_from_disk("/kaggle/working/tokenized_dataset")

split_datasets = tokenized_dataset.train_test_split(test_size=640, seed=42)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

Saving the dataset (0/1 shards):   0%|          | 0/120000 [00:00<?, ? examples/s]

## Setup LoRA Config
Setup PEFT config and get peft model for finetuning

In [12]:
# PEFT Config
peft_config = LoraConfig(
    r=11,
    lora_alpha=22,
    lora_dropout=0.1,
    bias = 'none',
    target_modules = ['query', 'value'],
    task_type="SEQ_CLS",
)

In [13]:
peft_model = get_peft_model(model, peft_config)

In [14]:
print('PEFT Model')
peft_model.print_trainable_parameters()

PEFT Model
trainable params: 999,172 || all params: 125,647,880 || trainable%: 0.7952


## Training Setup

In [15]:
# To track evaluation accuracy during training
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)
    return {
        'accuracy': accuracy
    }

In [16]:
config = {
    # Training hyperparameters
    "learning_rate": 5e-5,
    "weight_decay": 0.01,
    "label_smoothing_factor": 0.1,
    "neftune_noise_alpha": 5,
    "per_device_train_batch_size": 16,

    "max_grad_norm": 1.0,

    # Misc
    "run_name": "kaggle_NER_test_claude_minimal_lg",
}

# Initialize wandb with all configuration
wandb.init(
    project="dl-s-25-proj-2", 
    config=config,
    name=config["run_name"],
)

[34m[1mwandb[0m: Tracking run with wandb version 0.19.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250409_072859-eckvlave[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mkaggle_NER_test_claude_minimal_lg[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/sohith-bandari/dl-s-25-proj-2[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/sohith-bandari/dl-s-25-proj-2/runs/eckvlave[0m


In [17]:
# Setup Training args
output_dir = "results"
training_args = TrainingArguments(
    output_dir="/kaggle/working/results",
    report_to="wandb",
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=5,
    logging_steps=10,
    learning_rate=config["learning_rate"],
    weight_decay=config["weight_decay"],
    max_grad_norm=config["max_grad_norm"],
    # lr_scheduler_type="cosine",     # use linear
    warmup_steps=500,
    # max_steps=1200,
    num_train_epochs=1,
    per_device_train_batch_size=config["per_device_train_batch_size"],
    per_device_eval_batch_size=64,
    optim="adamw_torch",
    fp16=False,
    dataloader_num_workers=4,
    dataloader_prefetch_factor=2,
    run_name=config["run_name"],
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
    label_smoothing_factor=config["label_smoothing_factor"],
    dataloader_pin_memory=True,
    dataloader_persistent_workers=True,
    neftune_noise_alpha=config["neftune_noise_alpha"],
)

In [18]:
def get_trainer(model):
      return  Trainer(
          model=model,
          args=training_args,
          compute_metrics=compute_metrics,
          train_dataset=train_dataset,
          eval_dataset=eval_dataset,
          data_collator=data_collator,
      )

### Start Training

In [19]:
peft_lora_finetuning_trainer = get_trainer(peft_model)
result = peft_lora_finetuning_trainer.train()

Step,Training Loss,Validation Loss,Accuracy
50,1.3861,1.387831,0.226562
100,1.378,1.383293,0.226562
150,1.3815,1.37726,0.315625
200,1.3565,1.345268,0.701562
250,0.8861,0.77404,0.851562
300,0.6753,0.660781,0.84375
350,0.7109,0.634287,0.86875
400,0.7289,0.638942,0.859375
450,0.6055,0.626796,0.871875
500,0.5934,0.616058,0.885938


## Evaluate Finetuned Model


### Run Inference on eval_dataset

In [20]:
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

def evaluate_model(inference_model, dataset, labelled=True, batch_size=8, data_collator=None):
    """
    Evaluate a PEFT model on a dataset.

    Args:
        inference_model: The model to evaluate.
        dataset: The dataset (Hugging Face Dataset) to run inference on.
        labelled (bool): If True, the dataset includes labels and metrics will be computed.
                         If False, only predictions will be returned.
        batch_size (int): Batch size for inference.
        data_collator: Function to collate batches. If None, the default collate_fn is used.

    Returns:
        If labelled is True, returns a tuple (metrics, predictions)
        If labelled is False, returns the predictions.
    """
    # Create the DataLoader
    eval_dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inference_model.to(device)
    inference_model.eval()

    all_predictions = []
    if labelled:
        metric = evaluate.load('accuracy')

    # Loop over the DataLoader
    for batch in tqdm(eval_dataloader):
        # Move each tensor in the batch to the device
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = inference_model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        all_predictions.append(predictions.cpu())

        if labelled:
            # Expecting that labels are provided under the "labels" key.
            references = batch["labels"]
            metric.add_batch(
                predictions=predictions.cpu().numpy(),
                references=references.cpu().numpy()
            )

    # Concatenate predictions from all batches
    all_predictions = torch.cat(all_predictions, dim=0)

    if labelled:
        eval_metric = metric.compute()
        print("Evaluation Metric:", eval_metric)
        return eval_metric, all_predictions
    else:
        return all_predictions

In [21]:
_, _ = evaluate_model(peft_model, eval_dataset, True, 8, data_collator)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

100%|██████████| 80/80 [00:08<00:00,  9.30it/s]

Evaluation Metric: {'accuracy': 0.8953125}





### Run Inference on unlabelled dataset

In [22]:
#Load your unlabelled data
unlabelled_dataset = pd.read_pickle("/kaggle/input/deep-learning-spring-2025-project-2/test_unlabelled.pkl")
test_dataset = unlabelled_dataset.map(full_preprocess, batched=True, remove_columns=["text"])
unlabelled_dataset

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 8000
})

In [23]:
# Run inference and save predictions
preds = evaluate_model(peft_model, test_dataset, False, 8, data_collator)
df_output = pd.DataFrame({
    'ID': range(len(preds)),
    'Label': preds.numpy()  # or preds.tolist()
})
df_output.to_csv(os.path.join(output_dir,"inference_output.csv"), index=False)
print("Inference complete. Predictions saved to inference_output.csv")

100%|██████████| 1000/1000 [02:05<00:00,  7.94it/s]

Inference complete. Predictions saved to inference_output.csv



