In [None]:
from transformers import LlamaForSequenceClassification, AutoTokenizer, Trainer, DataCollatorWithPadding
from datasets import load_dataset, Dataset
from peft import PeftModel

In [20]:
# Load the WikiBio GPT-3 hallucination data from its local directory and keep
# the "test" split; the bare expression at the end renders its summary.
datasets = load_dataset(
    path="./datasets--potsawee--wiki_bio_gpt3_hallucination",
    trust_remote_code=True,
)
dataset = datasets["test"]
dataset

Dataset({
    features: ['gpt3_text', 'wiki_bio_text', 'gpt3_sentences', 'annotation', 'wiki_bio_test_idx', 'gpt3_text_samples'],
    num_rows: 238
})

In [21]:
# Flatten the passage-level split into one row per GPT-3 sentence:
#   hypothesis = the generated sentence
#   premise    = the reference biography text of the same passage
#   label      = the annotation attached to that sentence
#
# The raw split is read from `datasets["test"]` rather than from `dataset`,
# because this cell rebinds `dataset`; reading the untouched split keeps the
# cell re-runnable without reloading the data.
raw_test = datasets["test"]

# Pull each column once. Indexing `raw_test["col"][i]` inside the loop would
# re-read the entire column on every access.
sentences_per_bio = raw_test["gpt3_sentences"]
bio_texts = raw_test["wiki_bio_text"]
annotations_per_bio = raw_test["annotation"]

hypothesis = []
premise = []
label = []
for row_idx, (sentences, bio_text, annotations) in enumerate(
    zip(sentences_per_bio, bio_texts, annotations_per_bio)
):
    # Sentences and annotations are paired by position, so a length mismatch
    # would silently attach labels to the wrong sentences.
    if len(sentences) != len(annotations):
        raise ValueError(
            f"row {row_idx}: {len(sentences)} sentences but "
            f"{len(annotations)} annotations"
        )
    hypothesis.extend(sentences)
    premise.extend([bio_text] * len(sentences))
    label.extend(annotations)

dataset_dict = {
    "hypothesis": hypothesis,
    "premise": premise,
    "label": label,
}

dataset = Dataset.from_dict(dataset_dict)
dataset


Dataset({
    features: ['hypothesis', 'premise', 'label'],
    num_rows: 1908
})

In [27]:
# Tokenizer: reuse the EOS token as the padding token and request that an EOS
# token be appended to encoded sequences.
tokenizer = AutoTokenizer.from_pretrained("./llama-3.2-1B/")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True

# Base model with a 3-way classification head. The load log reports
# `score.weight` as newly initialised, so the head coming from the base
# checkpoint alone is untrained.
# NOTE(review): presumably the adapter checkpoint below supplies the trained
# head weights -- confirm.
model = LlamaForSequenceClassification.from_pretrained(
    "./llama-3.2-1B/",
    num_labels=3,
)  # bnb
model.config.pad_token_id = tokenizer.pad_token_id

# Attach the fine-tuned PEFT adapter, then merge it into the base weights so
# that a plain model object is available for inference.
p_model = PeftModel.from_pretrained(model, "./checkpoints/checkpoint-24471")
merge_model = p_model.merge_and_unload()

# Annotation string -> integer class id used as the label.
str2int = {
    'accurate': 0,
    'minor_inaccurate': 1,
    'major_inaccurate': 2,
}

print(type(tokenizer))
def process_function(example):
    """Turn one (premise, hypothesis, label) row into tokenized model inputs.

    Builds the NLI instruction prompt around ``example['premise']`` and
    ``example['hypothesis']``, tokenizes it with the module-level
    ``tokenizer``, and stores the integer class id looked up from the
    module-level ``str2int`` under the ``"labels"`` key.

    Args:
        example: a single dataset row providing the keys ``'premise'``,
            ``'hypothesis'`` and ``'label'``.

    Returns:
        The tokenizer output for the prompt with an added ``"labels"`` entry.

    Raises:
        KeyError: if ``example['label']`` is not a key of ``str2int``.
    """
    # Each trailing backslash below is a line continuation *inside* the string
    # literal, so the leading indentation of the following source line becomes
    # part of the prompt text.
    # NOTE(review): presumably this exact whitespace matches the prompt used
    # when the checkpoint was fine-tuned -- confirm before reformatting.
    prompt = f"This is an NLI task. I'll give you two sentences which are the premise and the hypothesis respectively.\n \
              If the hypothesis can be inferred from the premise, the answer is entailment.\n \
              If the hypothesis is inconsistent with the premise, the answer is contradiction.\n \
              If the hypothesis is unrelated to premise, the answer is neutral.\n\n \
              ---\n \
              Important Note: You can only response ONE word among entailment, neutral and contradiction.\n \
              ---\n\n \
              The premise: { example['premise'] }\n \
              The hypothesis: { example['hypothesis'] }\n \
              The answer: "
    
    # NOTE(review): no `truncation` argument is passed, so it looks like
    # prompts longer than 256 tokens are kept at full length and max_length
    # only sets the padded length of shorter ones -- confirm this is intended.
    tokenized_example = tokenizer(text=prompt, max_length=256, padding="max_length")
    # Map the annotation string to its integer class id.
    tokenized_example["labels"] = str2int[example["label"]]
    return tokenized_example

# Tokenize row by row and drop the original text columns, leaving only the
# fields produced by `process_function`.
tokenized_dataset = dataset.map(
    function=process_function,
    batched=False,
    remove_columns=dataset.column_names,
)
tokenized_dataset

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at ./llama-3.2-1B/ and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<class 'transformers.tokenization_utils_fast.PreTrainedTokenizerFast'>


Map:   0%|          | 0/1908 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1908
})

In [None]:
# Use a Trainer purely as a batched predictor (no training is run).
#
# The tokenizer is not passed to Trainer itself: the `tokenizer=` keyword is
# deprecated in recent transformers releases and emits a warning, and it is
# redundant here because the padding collator is supplied explicitly.
trainer = Trainer(
    model=merge_model,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

predictions = trainer.predict(tokenized_dataset)

# One predicted class id per example: argmax over the class axis of the logits,
# computed in a single vectorized call and converted to plain Python ints.
predicted_labelsint = predictions.predictions.argmax(axis=-1).tolist()



def _factuality_confusion_counts(pred_labels, true_labels):
    """Count binary confusion-matrix cells for factual vs. non-factual.

    Class 0 is the positive (factual) class; classes 1 and 2 are both treated
    as the negative (non-factual) class, so a 1-vs-2 confusion still counts as
    a correct negative.

    Args:
        pred_labels: iterable of predicted class ids.
        true_labels: iterable of reference class ids, aligned with pred_labels.

    Returns:
        Tuple ``(tp, tn, fp, fn)``.
    """
    tp = tn = fp = fn = 0
    for pred_label, true_label in zip(pred_labels, true_labels):
        if pred_label in [1, 2] and true_label in [1, 2]:
            tn += 1  # both non-factual
        elif pred_label == true_label:
            tp += 1  # both 0, factual
        elif true_label == 0:
            fn += 1  # factual sentence predicted as non-factual
        else:
            fp += 1  # non-factual sentence predicted as factual
    return tp, tn, fp, fn


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 if denominator is 0."""
    return float(numerator) / denominator if denominator else 0.0


tp, tn, fp, fn = _factuality_confusion_counts(
    predicted_labelsint, tokenized_dataset['labels']
)

# Guarded divisions: if the model never predicts class 0 (tp + fp == 0), or the
# data holds no class-0 rows, the metric is reported as 0.0 instead of raising
# ZeroDivisionError.
precision = _safe_ratio(tp, tp + fp)
recall = _safe_ratio(tp, tp + fn)
f1 = _safe_ratio(2 * precision * recall, precision + recall)

acc = _safe_ratio(tp + tn, len(predicted_labelsint))
print(f"tn={tn}, tp={tp}, fn={fn}, fp={fp}")
print(f"acc: {acc}")
print(f"precision: {precision}")
print(f"recall: {recall}")
print(f"f1: {f1}")


  trainer = Trainer(model=merge_model,


acc: 0.7751572327044025
precision: 0.8085106382978723
recall: 0.22093023255813954
f1: 0.3470319634703196
