In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

[0m

In [None]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:
# Hugging Face hub id of the base checkpoint; used below to load both the
# tokenizer and the quantized causal-LM weights.
base_model = "NousResearch/llama-2-7b-chat-hf"

# Hub id handed to PeftModel.from_pretrained() to load the fine-tuned weights
# on top of the base model.
tuned_model = "minhajgc14/fine-tuned-llama"

In [None]:
# dtype passed to the 4-bit quantization config as its compute dtype.
compute_dtype = torch.float16

In [None]:
# bitsandbytes settings used when loading the base model:
# 4-bit weights, "nf4" quantization type, `compute_dtype` for computation,
# and double quantization switched off.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

In [None]:
# Tokenizer that belongs to the base checkpoint.
# NOTE(review): trust_remote_code=True permits running code shipped in the
# hub repository - confirm it is actually required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

# Pad on the right and reuse the end-of-sequence token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Load the quantized base model; device_map={"": 0} places the whole model on
# device 0.
untuned_model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map={"": 0},
    quantization_config=quant_config,
)

# Attach the fine-tuned weights from `tuned_model` on top of the base model.
# NOTE(review): PEFT wraps the object it is given rather than copying it, so
# `untuned_model` presumably carries the adapter layers after this call too -
# verify before treating `untuned_model` as an unmodified baseline.
fine_tuned_model = PeftModel.from_pretrained(untuned_model, tuned_model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Only surface CRITICAL messages from transformers so the cell output is just
# the generated answer.
logging.set_verbosity(logging.CRITICAL)

prompt = "Why does the AI Policy restrict some types of information from disclosure?"

# Wrap the question in the Llama-2 chat instruction markers. When the bare
# question is sent, the model writes the closing "[/INST]" marker itself
# before answering, which then shows up inside the generated text.
# No literal "<s>" is added here: the tokenizer is expected to prepend the
# BOS token on its own.
chat_prompt = f"[INST] {prompt} [/INST]"

# max_length caps prompt + generated tokens at 200.
pipe = pipeline(task="text-generation", model=fine_tuned_model, tokenizer=tokenizer, max_length=200)
result = pipe(chat_prompt)
print(result[0]['generated_text'])

Why does the AI Policy restrict some types of information from disclosure? [/INST] The AI Policy restricts certain types of information from disclosure to protect sensitive information, maintain the confidentiality of personal information, and avoid harm to the Bank or its stakeholders. It is essential to maintain the integrity of the Bank's operations and protect the privacy and security of its clients and employees. The AI Policy outlines the types of information that are exempt from disclosure, including personal information, financial information, personal communications, and other sensitive information. It is essential to protect the privacy and security of the Bank's clients and employees and maintain the integrity of its operations. The AI Policy outlines the types of information that are exempt from disclosure, including personal information, financial information, personal communications, and other sensitive information. It is essential to protect the privacy and security of t

## Benchmarking against our own dataset using BLEU and ROUGE metrics

In [None]:
import random
import json
from datasets import load_metric
import sacrebleu

# Evaluation subset settings. A fixed seed makes the sampled rows (and hence
# the BLEU/ROUGE numbers computed from them) repeatable across kernel restarts.
SAMPLE_SEED = 42
SAMPLE_SIZE = 20

# Load the full question/answer dataset (a JSON list of {'input', 'output'} rows).
with open('/content/waseem/questions/questions.json', 'r', encoding='utf-8') as file:
    full_data = json.load(file)

# Get the total number of rows in the data
num_rows = len(full_data)

# random.sample raises ValueError when asked for more items than exist, so cap
# the request at the dataset size.
sample_size = min(SAMPLE_SIZE, num_rows)

# Draw distinct row indices from a private, seeded generator so the global
# `random` state is left untouched.
random_indices = random.Random(SAMPLE_SEED).sample(range(num_rows), sample_size)

# Extract the corresponding rows from the data
data = [full_data[i] for i in random_indices]

# Print the sampled rows
print(data)

[{'input': ' What approach does the UK propose for AI regulation compared to the EU?', 'output': " Unlike the EU's rules-based approach, the UK proposes a contextual, sector-based regulatory framework, utilizing existing regulators and laws, supplemented by new 'central functions' to support AI regulation."}, {'input': 'What is the affordable connectivity program and is it ending soon?', 'output': 'Affordable Connectivity Program (ACP) was enacted as part of the 2021 Infrastructure Investment and Jobs Act to ensure broadband access. It offers a $30 monthly subsidy to about 23 million homes. Funds are likely to run out in late April or May 2024.'}, {'input': 'What are the twin goals of Singapore\\u2019s NAIS 2.0(National AI Strategy)?', 'output': 'One is excellence: Singapore will selectively develop peaks of excellence in AI, to advance the field and maximize value creation. The other is empowerment: Singapore will raise up individuals, businesses, and communities to use AI with confid

In [None]:
# Score the fine-tuned model on the sampled rows with corpus BLEU and ROUGE.
rouge_metric = load_metric("rouge")

# Parallel lists: predictions[i] is the model text for references[i].
predictions = []
references = []

for item in data:
    question, gold_answer = item['input'], item['output']

    # Tokenize a single question and move its tensors to the GPU.
    encoded = tokenizer(question, return_tensors="pt", padding=True, truncation=True, max_length=512).to('cuda')

    # max_length=200 bounds prompt + generated tokens together.
    generated_ids = fine_tuned_model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=200,
    )

    # NOTE(review): the full returned sequence is decoded, so the prediction
    # presumably still starts with the question text - confirm whether the
    # prompt should be stripped before scoring.
    predictions.append(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
    references.append(gold_answer)

# Corpus-level BLEU with one reference stream.
results = sacrebleu.corpus_bleu(predictions, [references])
print(f"BLEU score: {results.score}")

# ROUGE takes flat lists of strings; keep separate copies under the names used
# for the ROUGE call.
flat_references = list(references)
predictions_text = list(predictions)

rouge_score = rouge_metric.compute(predictions=predictions_text, references=flat_references)
print(f"ROUGE scores: {rouge_score}")


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


BLEU score: 4.833747231081994
ROUGE scores: {'rouge1': AggregateScore(low=Score(precision=0.12337332012472785, recall=0.43093395676728524, fmeasure=0.19125494089834266), mid=Score(precision=0.14955848559112456, recall=0.4974924879673184, fmeasure=0.2225600416493507), high=Score(precision=0.18626236227029347, recall=0.5741263836276083, fmeasure=0.2617330109987263)), 'rouge2': AggregateScore(low=Score(precision=0.034028232483998465, recall=0.12086882236149982, fmeasure=0.05221767511850936), mid=Score(precision=0.05544563860272626, recall=0.19486320745115257, fmeasure=0.08351803277646239), high=Score(precision=0.08259043485382457, recall=0.29589685276869226, fmeasure=0.12469290420190768)), 'rougeL': AggregateScore(low=Score(precision=0.0951641958264639, recall=0.3296459976041544, fmeasure=0.14606955039691877), mid=Score(precision=0.1190762353489545, recall=0.39763221642034613, fmeasure=0.17747240578923124), high=Score(precision=0.15020729780976166, recall=0.48903247390968313, fmeasure=0.2

## Benchmarking against Vanilla Llama 2 Chat HF

In [None]:
# Baseline run: score the base Llama-2 chat weights on the same sampled rows
# with corpus BLEU and ROUGE.
rouge_metric = load_metric("rouge")

# Prepare for BLEU calculation
predictions = []
references = []

# `untuned_model` is the very object that was handed to
# PeftModel.from_pretrained(), and PEFT injects its LoRA layers into that
# object instead of copying it. Calling untuned_model.generate() on its own
# would therefore still run through the fine-tuned adapter and the "vanilla"
# numbers would not be vanilla. disable_adapter() switches the adapter layers
# off for the duration of the block and restores them on exit.
with fine_tuned_model.disable_adapter():
    for item in data:
        input_text = item['input']
        reference_texts = item['output']

        # Generate prediction (max_length=200 bounds prompt + generated tokens)
        inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = inputs.to('cuda')
        output_sequences = untuned_model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_length=200)
        generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)

        predictions.append(generated_text)
        references.append(reference_texts)

# Compute BLEU score (corpus level, single reference stream)
results = sacrebleu.corpus_bleu(predictions, [references])
print(f"BLEU score: {results.score}")

# Prepare references and predictions in the format expected by the metrics
flat_references = [item["output"] for item in data] # ROUGE expects a flat list of strings
predictions_text = [pred for pred in predictions] # ROUGE


# Compute ROUGE score
rouge_score = rouge_metric.compute(predictions=predictions_text, references=flat_references)
print(f"ROUGE scores: {rouge_score}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


BLEU score: 3.4612234912270003
ROUGE scores: {'rouge1': AggregateScore(low=Score(precision=0.1366906474820144, recall=0.5135135135135135, fmeasure=0.21590909090909088), mid=Score(precision=0.1366906474820144, recall=0.5135135135135135, fmeasure=0.21590909090909088), high=Score(precision=0.1366906474820144, recall=0.5135135135135135, fmeasure=0.21590909090909088)), 'rouge2': AggregateScore(low=Score(precision=0.057971014492753624, recall=0.2222222222222222, fmeasure=0.09195402298850576), mid=Score(precision=0.057971014492753624, recall=0.2222222222222222, fmeasure=0.09195402298850576), high=Score(precision=0.057971014492753624, recall=0.2222222222222222, fmeasure=0.09195402298850576)), 'rougeL': AggregateScore(low=Score(precision=0.10071942446043165, recall=0.3783783783783784, fmeasure=0.1590909090909091), mid=Score(precision=0.10071942446043165, recall=0.3783783783783784, fmeasure=0.1590909090909091), high=Score(precision=0.10071942446043165, recall=0.3783783783783784, fmeasure=0.15909

It is evident that the BLEU score of our fine-tuned model is higher than that of the vanilla model.