In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
# os.environ["CUDA_VISIBLE_DEVICES"]="1"

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from lm_polygraph.estimators import *
from lm_polygraph.utils.model import WhiteboxModel
from lm_polygraph.utils.dataset import Dataset
from lm_polygraph.utils.processor import Logger
from lm_polygraph.utils.manager import UEManager
from lm_polygraph.ue_metrics import PredictionRejectionArea
from lm_polygraph.generation_metrics import RougeMetric, BartScoreSeqMetric, ModelScoreSeqMetric, ModelScoreTokenwiseMetric, AggregatedMetric
from lm_polygraph.utils.builder_enviroment_stat_calculator import (
    BuilderEnvironmentStatCalculator
)
from lm_polygraph.defaults.register_default_stat_calculators import (
    register_default_stat_calculators,
)
from lm_polygraph.utils.factory_stat_calculator import StatCalculatorContainer
from omegaconf import OmegaConf

# Specify HyperParameters

In [2]:
# model_path = "bigscience/bloomz-560m"
model_path = "meta-llama/Llama-3.1-8B-Instruct"
device = "cuda"
model_type = "Whitebox"
dataset_name = "denis1699/hotpot_cot"
batch_size = 1
seed = 42

# Initialize Model

In [3]:
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
)

base_model = AutoModelForCausalLM.from_pretrained(
    model_path,
    token=os.getenv("HF_TOKEN"),
    device_map=device,
    quantization_config=quantization_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_path,
                                          token=os.getenv("HF_TOKEN")
                                         )
tokenizer.pad_token_id = tokenizer.eos_token_id

model = WhiteboxModel(base_model, tokenizer)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

# Train and Eval Datasets

In [4]:
# Use validation split, since test split of trivia_qa doesn't have reference answers
dataset = Dataset.load(
    dataset_name,
    'question_with_cot', 'answer',
    batch_size=batch_size,
    prompt="Question: {question_with_cot}\nAnswer:{answer}",
    split="validation"
)
dataset.subsample(16, seed=seed)

train_dataset = Dataset.load(
    dataset_name,
    'question_with_cot', 'answer',
    batch_size=batch_size,
    prompt="Question: {question_with_cot}\nAnswer:{answer}",
    split="train"
)
train_dataset.subsample(16, seed=seed)

# Metric, UE Metric, and UE Methods

In [5]:
ue_methods = [MaximumSequenceProbability(), 
              SemanticEntropy(),
              MahalanobisDistanceSeq("decoder"),
             ]

ue_metrics = [PredictionRejectionArea(), PredictionRejectionArea(max_rejection=0.5)]

# Wrap generation metric in AggregatedMetric, since trivia_qa is a multi-reference dataset
# (y is a list of possible correct answers)
metrics = [AggregatedMetric(RougeMetric('rougeL'))]

loggers = [Logger()] 

# Stat Calculators

In [6]:
TrainingStatistic_config = {
    "dataset": dataset_name,
    "text_column": 'question_with_cot',
    "label_column": 'answer',
    "description": '',
    "prompt": "Question: {question_with_cot}\nAnswer:",
    "few_shot_split": 'train',
    "train_split": 'train',
    "load_from_disk": False,
    "subsample_train_dataset": 10,
    "n_shot": 5,
    "train_dataset": dataset_name,
    "train_test_split": False,
    # needs to be improved to get rid of hardcoded dataset variables.
    "background_train_dataset": "denis1699/hotpot_cot",
    "background_train_dataset_text_column": 'question_with_cot',
    "background_train_dataset_label_column": 'answer',
    "background_train_dataset_data_files": 'train.csv',
    "background_load_from_disk": False,
    "subsample_background_train_dataset": 10,
    "batch_size": 1,
    "size": 16,
    "bg_size": 16,
    "seed": 1
}

In [7]:
# register default stat calculators
result_stat_calculators = dict()
scs = register_default_stat_calculators(model_type)
for sc in scs:
    result_stat_calculators[sc.name] = sc

# register TrainingStatisticExtractionCalculator for the Mahalanobis Distance method
result_stat_calculators.update(
    {
        "TrainingStatisticExtractionCalculator": StatCalculatorContainer(
            name="TrainingStatisticExtractionCalculator",
            cfg=OmegaConf.create(TrainingStatistic_config),
            stats=["train_embeddings", "background_train_embeddings", "train_greedy_log_likelihoods"],
            dependencies=[],
            builder="lm_polygraph.defaults.stat_calculator_builders.default_TrainingStatisticExtractionCalculator",
        )
    }
)
    
builder_env_stat_calc = BuilderEnvironmentStatCalculator(model=model)
available_stat_calculators = list(result_stat_calculators.values())

# Manager

In [8]:
man = UEManager(
    data=dataset,
    model=model,
    estimators=ue_methods,
    builder_env_stat_calc=builder_env_stat_calc,
    available_stat_calculators=available_stat_calculators,
    generation_metrics=metrics,
    ue_metrics=ue_metrics,
    processors=loggers,
    ignore_exceptions=False,
    max_new_tokens=64
)

Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Compute Results

In [9]:
results = man()

  0%|                                                    | 0/16 [00:00<?, ?it/s]
[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[A[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.s]


[A[ASetting `pad_toke

In [10]:
for key in results.keys():
    print(f"UE Score: {key[1]}, Metric: {key[2]}, UE Metric: {key[3]}, Score: {results[key]:.3f}")

UE Score: MaximumSequenceProbability, Metric: Rouge_rougeL, UE Metric: prr, Score: 0.702
UE Score: MaximumSequenceProbability, Metric: Rouge_rougeL, UE Metric: prr_normalized, Score: 0.225
UE Score: SemanticEntropy, Metric: Rouge_rougeL, UE Metric: prr, Score: 0.627
UE Score: SemanticEntropy, Metric: Rouge_rougeL, UE Metric: prr_normalized, Score: -0.209
UE Score: MahalanobisDistanceSeq_decoder, Metric: Rouge_rougeL, UE Metric: prr, Score: 0.652
UE Score: MahalanobisDistanceSeq_decoder, Metric: Rouge_rougeL, UE Metric: prr_normalized, Score: -0.063
UE Score: MaximumSequenceProbability, Metric: Rouge_rougeL, UE Metric: prr_0.5, Score: 0.675
UE Score: MaximumSequenceProbability, Metric: Rouge_rougeL, UE Metric: prr_0.5_normalized, Score: 0.095
UE Score: SemanticEntropy, Metric: Rouge_rougeL, UE Metric: prr_0.5, Score: 0.650
UE Score: SemanticEntropy, Metric: Rouge_rougeL, UE Metric: prr_0.5_normalized, Score: -0.115
UE Score: MahalanobisDistanceSeq_decoder, Metric: Rouge_rougeL, UE Metri