# Notebook 3: Prediction (GPU - Using Saved LoRA Adapter)
Purpose:
1. Load tokenizer and LoRA adapter files saved by Notebook 2.
2. Load the base RoBERTa model architecture and weights from Hugging Face Hub.
3. Apply the saved LoRA adapter to the base model and merge it into the base weights for faster inference.
4. Load and preprocess the unlabeled test data.
5. Run inference with a manual prediction loop on the GPU (falling back to the CPU if no GPU is available).
6. Generate the submission file.

In [1]:
# --- Essential Imports ---
import os
import pickle
import gc
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from datasets import Dataset
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    AutoConfig 
)
from peft import PeftModel, PeftConfig 
from tqdm.auto import tqdm 
import traceback

2025-04-19 03:36:47.295915: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745033807.476266      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745033807.527173      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# --- Configuration ---
# Directory holding the tokenizer files and the LoRA adapter that the
# loading cells below read from.
saved_model_path = "/kaggle/input/alt/transformers/default/1"

# Pickle file containing the unlabeled test examples.
unlabeled_data_path = "/kaggle/input/deep-learning-spring-2025-project-2/test_unlabelled.pkl"

# --- Model & Tokenizer Settings ---
base_model_name = 'roberta-base'
TOKENIZER_MAX_LENGTH = 512  # every example is truncated/padded to this many tokens

# --- Prediction Settings ---
PREDICTION_BATCH_SIZE = 16  # examples per forward pass

# --- Label Mapping ---
# A single ordered list is the source of truth; both lookup tables and the
# class count are derived from it so they cannot drift apart.
id2label = dict(enumerate(['World', 'Sports', 'Business', 'Sci/Tech']))
label2id = {label_name: label_idx for label_idx, label_name in id2label.items()}
num_labels = len(id2label)

In [3]:
# --- GPU Check ---
# Use CUDA when PyTorch can see a GPU; otherwise fall back to the CPU and warn,
# since inference will still run but much more slowly.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f"GPU is available. Using device: {device}")
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
else:
    print("WARNING: GPU not available, using CPU. Prediction will be very slow.")

GPU is available. Using device: cuda
GPU Name: Tesla P100-PCIE-16GB


In [4]:
# --- Load Tokenizer ---
# The tokenizer is read from the saved-model directory rather than the Hub.
print(f"Loading tokenizer from saved path: {saved_model_path}")
try:
    # Fail early with an explicit message when the input directory is missing.
    saved_dir_exists = os.path.isdir(saved_model_path)
    if not saved_dir_exists:
        raise FileNotFoundError(f"Directory not found: {saved_model_path}")
    tokenizer = RobertaTokenizer.from_pretrained(saved_model_path)
except Exception as load_error:
    # Report the likely cause, then propagate so later cells do not run
    # without a tokenizer.
    print(f"ERROR: Failed to load tokenizer from {saved_model_path}: {load_error}")
    print("Ensure the path is correct and contains tokenizer files (vocab.json, merges.txt etc.)")
    raise
else:
    print("Tokenizer loaded successfully.")

Loading tokenizer from saved path: /kaggle/input/alt/transformers/default/1
Tokenizer loaded successfully.


In [5]:
print("--- Starting Model Loading Process ---")
try:
    # Step 1: fetch the base configuration from the Hub and attach the label mapping.
    print(f"Loading base config for {base_model_name} from Hub...")
    label_settings = dict(num_labels=num_labels, id2label=id2label, label2id=label2id)
    config = AutoConfig.from_pretrained(base_model_name, **label_settings)
    print("Base config loaded and updated with label info.")

    # Step 2: instantiate the base classifier from Hub weights using that config.
    print(f"Loading base model weights for {base_model_name} from Hub...")
    base_model = RobertaForSequenceClassification.from_pretrained(base_model_name, config=config)
    print("Base model architecture and weights loaded.")

    # Step 3: wrap the base model with the adapter stored in the saved directory.
    print(f"Loading LoRA adapter weights from: {saved_model_path}")
    model = PeftModel.from_pretrained(base_model, saved_model_path)
    print("PEFT adapter loaded and applied to base model.")

    # Step 4: merge the adapter into the base model and drop the PEFT wrapper.
    print("Merging LoRA adapters into base model for faster inference...")
    model = model.merge_and_unload()
    print("LoRA adapters merged and unloaded.")

    # Step 5: place the model on the target device and switch to evaluation mode.
    model = model.to(device)
    model.eval()
    print(f"Model moved to {device} and set to evaluation mode.")

except Exception as load_error:
    # Show the full traceback, then propagate so the notebook stops here.
    print(f"ERROR during model loading: {load_error}")
    print("Check paths, model names, and ensure necessary files (adapter_*, base model cache) are accessible.")
    traceback.print_exc()
    raise

# Release Python garbage and cached CUDA blocks left over from loading.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

--- Starting Model Loading Process ---
Loading base config for roberta-base from Hub...


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Base config loaded and updated with label info.
Loading base model weights for roberta-base from Hub...


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Base model architecture and weights loaded.
Loading LoRA adapter weights from: /kaggle/input/alt/transformers/default/1
PEFT adapter loaded and applied to base model.
Merging LoRA adapters into base model for faster inference...
LoRA adapters merged and unloaded.
Model moved to cuda and set to evaluation mode.


In [6]:
# --- Load and Preprocess Unlabeled Data ---
# Reads the pickled test set, normalizes it into a Hugging Face Dataset with a
# 'text' column (plus an 'id' column where one exists or is generated), and
# tokenizes it into fixed-length 'input_ids' / 'attention_mask' tensors.
print(f"\nLoading unlabeled data from: {unlabeled_data_path}")
try:
    # SECURITY NOTE: pickle.load can execute arbitrary code while deserializing.
    # Only point unlabeled_data_path at a file from a trusted source.
    with open(unlabeled_data_path, "rb") as f:
        test_unlabelled_pickle = pickle.load(f)

    print(f"Type of loaded pickle object: {type(test_unlabelled_pickle)}")

    # Convert pickle content to HF Dataset
    id_source_df = None  # Set only for DataFrame input; the submission cell can use its index as IDs.

    if isinstance(test_unlabelled_pickle, Dataset):
        print("Pickle contains HF Dataset. Using directly.")
        test_unlabelled_dataset_hf = test_unlabelled_pickle
    elif isinstance(test_unlabelled_pickle, pd.DataFrame):
        print("Pickle contains DataFrame. Converting...")
        id_source_df = test_unlabelled_pickle
        if 'text' not in id_source_df.columns:
            raise KeyError("'text' column missing in DataFrame")
        test_unlabelled_dataset_hf = Dataset.from_pandas(id_source_df)
    elif isinstance(test_unlabelled_pickle, dict):
        print("Pickle contains dict. Converting...")
        if 'text' not in test_unlabelled_pickle:
            raise KeyError("'text' key missing in dict")
        test_unlabelled_dataset_hf = Dataset.from_dict(test_unlabelled_pickle)
        # Add sequential IDs if not present
        if 'id' not in test_unlabelled_dataset_hf.column_names:
            test_unlabelled_dataset_hf = test_unlabelled_dataset_hf.add_column(
                "id", list(range(len(test_unlabelled_dataset_hf)))
            )
    elif isinstance(test_unlabelled_pickle, list) and len(test_unlabelled_pickle) > 0 and isinstance(test_unlabelled_pickle[0], str):
        print("Pickle contains list of strings. Creating dataset...")
        test_unlabelled_dataset_hf = Dataset.from_dict({"text": test_unlabelled_pickle})
        test_unlabelled_dataset_hf = test_unlabelled_dataset_hf.add_column(
            "id", list(range(len(test_unlabelled_dataset_hf)))
        )
    else:
        raise TypeError(f"Unsupported data type loaded from pickle: {type(test_unlabelled_pickle)}")

    # Derive the flag from the dataset itself instead of assuming it per branch:
    # a DataFrame without an 'id' column would otherwise make map() below fail
    # when asked to remove a column that does not exist.
    id_col_present = 'id' in test_unlabelled_dataset_hf.column_names

    print(f"Created unlabeled HF Dataset with {len(test_unlabelled_dataset_hf)} examples.")
    print(f"Columns: {test_unlabelled_dataset_hf.column_names}")
    if 'text' not in test_unlabelled_dataset_hf.column_names:
        raise KeyError("Resulting dataset must have a 'text' column for tokenization.")

    # --- Tokenize ---
    print("Tokenizing unlabeled data...")
    def preprocess_unlabelled(examples):
        """Tokenize a batch of examples, truncating/padding each to TOKENIZER_MAX_LENGTH.

        Every sequence comes out with the same length, so the DataLoader's
        default collation can stack rows without a custom collate function.
        Plain lists are returned; set_format() below is what makes the dataset
        yield torch tensors.
        """
        return tokenizer(
            examples['text'],
            truncation=True,
            padding='max_length',
            max_length=TOKENIZER_MAX_LENGTH,
        )

    tokenized_unlabelled = test_unlabelled_dataset_hf.map(
        preprocess_unlabelled,
        batched=True,
        remove_columns=['text'] + (['id'] if id_col_present else [])
    )

    # Set format to ensure __getitem__ returns tensors for the model inputs only
    tokenized_unlabelled.set_format("torch", columns=["input_ids", "attention_mask"])
    print("Unlabeled data tokenized and formatted.")

except FileNotFoundError:
    print(f"ERROR: Unlabeled test data not found at {unlabeled_data_path}.")
    raise
except Exception as e:
    print(f"ERROR loading or preprocessing unlabeled data: {e}")
    traceback.print_exc()
    raise


Loading unlabeled data from: /kaggle/input/deep-learning-spring-2025-project-2/test_unlabelled.pkl
Type of loaded pickle object: <class 'datasets.arrow_dataset.Dataset'>
Pickle contains HF Dataset. Using directly.
Created unlabeled HF Dataset with 8000 examples.
Columns: ['text']
Tokenizing unlabeled data...


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Unlabeled data tokenized and formatted.


In [7]:
# --- Manual Prediction Loop ---
print(f"Starting prediction loop with batch size: {PREDICTION_BATCH_SIZE}")
all_preds = []
# Plain PyTorch DataLoader over the tokenized dataset; only the batch size is customized.
pred_dataloader = DataLoader(tokenized_unlabelled, batch_size=PREDICTION_BATCH_SIZE)

# Inference only: disable gradient tracking for the whole loop.
with torch.no_grad():
    for batch in tqdm(pred_dataloader, desc="Predicting"):
        # Keep just the tensors the model expects and move them to the model's device.
        try:
            model_inputs = {
                name: tensor.to(device)
                for name, tensor in batch.items()
                if name in ['input_ids', 'attention_mask']
            }
            if len(model_inputs) == 0:
                print(f"Warning: Empty batch after filtering for model inputs? Batch keys: {batch.keys()}")
                continue
        except AttributeError:
            print(f"ERROR: Error moving batch to device. Batch keys: {batch.keys()}. Ensure data format is torch tensors.")
            raise

        # Forward pass; the predicted class is the index of the largest logit.
        try:
            logits = model(**model_inputs).logits
            predictions = logits.argmax(dim=-1)
            # Bring the batch of class ids back to host memory and append them in order.
            all_preds.extend(predictions.cpu().numpy())
        except Exception as e:
            print(f"ERROR during model prediction on batch: {e}")
            traceback.print_exc()
            raise

print(f"Finished prediction loop. Total predictions: {len(all_preds)}")

# Free Python garbage and cached CUDA blocks now that inference is done.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

Starting prediction loop with batch size: 16


Predicting:   0%|          | 0/500 [00:00<?, ?it/s]

Finished prediction loop. Total predictions: 8000


In [9]:
# --- Create Submission File ---
# Pairs each prediction with an ID and writes the result as a CSV with
# columns "ID" and "label".
print("Creating submission DataFrame...")
try:
    # Resolve IDs in order of preference: an explicit 'id' column, the index of
    # the source DataFrame, or generated sequential integers.
    if 'id' in test_unlabelled_dataset_hf.column_names:
        id_series = test_unlabelled_dataset_hf['id']
        # If IDs were loaded as tensors, convert them
        if isinstance(id_series, torch.Tensor):
            id_series = id_series.numpy()
        elif not isinstance(id_series, (list, np.ndarray, pd.Series)):
            id_series = list(id_series)  # Convert if it's some other iterable
    elif id_source_df is not None:  # If original was DataFrame
        id_series = id_source_df.index
    else:
        id_series = range(len(all_preds))
        print("Using generated sequential IDs for submission.")

    # Ensure lengths match before building the frame, so a skipped batch or a
    # wrong ID source is reported clearly instead of as a pandas shape error.
    if len(id_series) != len(all_preds):
        raise ValueError(f"Mismatch between number of IDs ({len(id_series)}) and predictions ({len(all_preds)}). Check ID extraction logic.")

    submission_df = pd.DataFrame({"ID": id_series, "label": all_preds})

    submission_file = "/kaggle/working/submission15.csv"
    submission_df.to_csv(submission_file, index=False)
    print(f"Submission file saved to {submission_file}")
    print(submission_df.head())

except Exception as e:
    print(f"ERROR during submission file creation: {e}")
    traceback.print_exc()
    # Propagate the failure, as the other cells do: swallowing it would let
    # "Run All" finish without a submission file having been written.
    raise

Creating submission DataFrame...
Using generated sequential IDs for submission.
Submission file saved to /kaggle/working/submission15.csv
   ID  label
0   0      3
1   1      0
2   2      0
3   3      3
4   4      2
