In [1]:
import os
os.environ['HF_HOME'] = '/home/sa5u24/safe_lora'
hf_home = os.path.expanduser(
    os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface"))
)
print(hf_home)

from huggingface_hub import login

# Replace 'your-hf-token-here' with your actual Hugging Face token
login(token="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")

/home/sa5u24/safe_lora


In [2]:
from datasets import load_dataset

dataset = load_dataset("knkarthick/dialogsum")
dataset_val = dataset['test']
print(len(dataset_val))

1500


In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model, PeftModel
from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig

model_id = "meta-llama/Llama-2-7b-chat-hf"

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)


model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto",
            # attn_implementation="flash_attention_2", # not supported for training
            torch_dtype=torch.bfloat16,
            quantization_config=bnb_config)

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token 

lora_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.05,
        r=8,
        bias="none",
        target_modules=["q_proj", "v_proj"],
        task_type="CAUSAL_LM",
)

model_lora_v0 = get_peft_model(model, lora_config)

#load the lora weights
lora_path = "/home/sa5u24/safe_lora/fine-tuned-llama-original/checkpoint-480"
model_lora = PeftModel.from_pretrained(model_lora_v0, lora_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
def generate_prompt_qa(example):
    """Generates a standardized message to prompt the model with an instruction, optional input and a
    'response' field."""
    
    return f"### Context:Summarize the following dialogue as briefly and precisely as possible. Focus only on the main points and avoid unnecessary details: \n### Dialogue:\n{example['dialogue']}\n\n### Summary:\n"


def preprocess_function(model, example: dict, tokenizer=tokenizer, max_length=512, mask_inputs: bool = True):
    
    """Processes a single sample."""
    inputs = generate_prompt_qa(example)
    # print("inputs", inputs)
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True, return_tensors="pt")
    
    # Generate text with the model
    generated_ids = model.generate(**model_inputs, max_new_tokens=max_length)

    # Trim the generated ids to remove the input ids
    trimmed_generated_ids = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    # Decode the output text
    output_text = tokenizer.batch_decode(
        trimmed_generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )

    return output_text[0]  # Return the first decoded output text
      

In [5]:
import evaluate
import torch
from nltk.translate.meteor_score import meteor_score, single_meteor_score
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
import requests
from torch import nn


all_pred = []
all_ans = []
i=0
model.eval()
with torch.no_grad():
    for sample in dataset_val:
        output = preprocess_function(model_lora, sample)
        ans = sample['summary']
        all_pred.append(output)
        all_ans.append(ans)
        i+= 1
        if i>49:
            break

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


In [6]:
len(all_pred), 

(50,)

In [7]:
for i in range(len(all_pred)):
    print("pred:", all_pred[i])
    print("ans:", all_ans[i])

pred: Ms. Dawson is requested to take dictation for a memo that outlines a new office policy restricting all forms of communication to email and official memos. The use of Instant Messaging is prohibited during working hours, and any employee found violating the policy will face consequences.
ans: Ms. Dawson helps #Person1# to write a memo to inform every employee that they have to change the communication method and should not use Instant Messaging anymore.
ans: In order to prevent employees from wasting time on Instant Message programs, #Person1# decides to terminate the use of those programs and asks Ms. Dawson to send out a memo to all employees by the afternoon.
pred: The manager, #Person1#, is implementing a new policy that restricts all office communications to email and official memos. Instant Messaging is prohibited during working hours, and any employee found using it will face consequences. The policy applies to both internal and external communications.
ans: Ms. Dawson take

In [8]:
from evaluate import load

# Load the metrics
meteor = load("meteor")
bleu = load("bleu")
rouge = load("rouge")

rouge_results = rouge.compute(predictions=all_pred, references=all_ans)
bleu_result = bleu.compute(predictions=all_pred, references=all_ans)
meteor_result = meteor.compute(predictions=all_pred, references=all_ans)

print("ROUGE:", rouge_results)
print("BLEU:", bleu_result)
print("METEOR:", meteor_result)

[nltk_data] Downloading package wordnet to /home/sa5u24/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/sa5u24/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/sa5u24/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


ROUGE: {'rouge1': 0.24854578091593932, 'rouge2': 0.06791501550732229, 'rougeL': 0.1853941041774689, 'rougeLsum': 0.1845013078781948}
BLEU: {'bleu': 0.0529570431734555, 'precisions': [0.20093457943925233, 0.0746775288526816, 0.03729281767955801, 0.014054813773717497], 'brevity_penalty': 1.0, 'length_ratio': 2.3891547049441786, 'translation_length': 2996, 'reference_length': 1254}
METEOR: {'meteor': 0.3236960207287929}


In [8]:
rouge = evaluate.load("rouge")
rouge_results = rouge.compute(predictions= all_pred, references=all_ans)
print(rouge_results)

{'rouge1': 0.26203009878637495, 'rouge2': 0.05205727873581786, 'rougeL': 0.19660937352957403, 'rougeLsum': 0.1967861105991594}


In [9]:
bleu_score = corpus_bleu(all_ans, all_pred, weights=(1.0, 0.0, 0.0, 0.0))
print(bleu_score)

0.17524626628535114


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [10]:
import nltk
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

m_score=0
for line in zip(all_ans, all_pred):
    ref = word_tokenize(line[0])
    hypo = word_tokenize(line[1])
    m_score += meteor_score([ref], hypo)
meteors = m_score/len(all_ans)
print(meteors)

[nltk_data] Downloading package wordnet to /home/sa5u24/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/sa5u24/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


0.22623503400948844
