# Evaluating-RAG-with-Llama2-7B

In [None]:
!pip install transformers datasets langchain bitsandbytes python-dotenv accelerate datasets faiss-gpu

In [2]:
!nvidia-smi

Sat Nov  4 15:21:22 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Init Retriever/ Embedding model and process dataset

In [2]:
from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering, Pipeline, pipeline
import torch
import torch.nn.functional as F
from datasets import load_dataset

In [3]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

## Retriever model

In [4]:
# Load model from HuggingFace Hub
retriever_id = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(retriever_id)
model = AutoModel.from_pretrained(retriever_id)


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


class EmbeddingPipeline(Pipeline):
    def _sanitize_parameters(self, **kwargs):
        preprocess_kwargs = {}
        return preprocess_kwargs, {}, {}


    def preprocess(self, text):
        encoded_text = self.tokenizer(text, padding=True, truncation=True, return_tensors='pt')
        return encoded_text


    def _forward(self, model_inputs):
        outputs = self.model(**model_inputs)
        return {"outputs": outputs, "attention_mask": model_inputs["attention_mask"]}


    def postprocess(self, model_outputs):
        sentence_embeddings = mean_pooling(model_outputs["outputs"], model_outputs['attention_mask'])
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        return sentence_embeddings[0].numpy()

retriever = EmbeddingPipeline(model=model, tokenizer=tokenizer, device=device)

## Dataset

In [5]:
ds = load_dataset("squad_v2")
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

In [13]:
ds_train = ds["train"]
ds_test = ds["validation"]

ds_train, ds_test

(Dataset({
     features: ['id', 'title', 'context', 'question', 'answers'],
     num_rows: 130319
 }),
 Dataset({
     features: ['id', 'title', 'context', 'question', 'answers'],
     num_rows: 11873
 }))

In [15]:
ds_train = ds_train.map(lambda row: {"context_embedding": retriever(row["context"])})
ds_train

Map:   0%|          | 0/130319 [00:00<?, ? examples/s]



Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'context_embedding'],
    num_rows: 130319
})

In [6]:
from google.colab import drive
drive.mount("/content/drive")

save_path = "/content/drive/MyDrive/squadv2"
#ds_train.save_to_disk(save_path)

from datasets import Dataset
ds_train = Dataset.load_from_disk(save_path)
ds_train

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'context_embedding'],
    num_rows: 130319
})

# Load LLama 2 7b

In [7]:
from torch import cuda, bfloat16
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

In [8]:
import os
from dotenv import load_dotenv

load_dotenv()
hf_auth = os.getenv("HF")

In [9]:
from transformers import AutoConfig

llama2_id = 'meta-llama/Llama-2-7b-chat-hf'

model_config = AutoConfig.from_pretrained(
    llama2_id,
    use_auth_token=hf_auth
)



In [10]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    llama2_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [11]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    llama2_id,
    use_auth_token=hf_auth
)



In [12]:
from transformers import pipeline

pipe = pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    temperature=0.1,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    repetition_penalty=1.1  # without this output begins repeating
)

# Use LLama 2 7b in langchain

In [14]:
from langchain import HuggingFacePipeline

llm = HuggingFacePipeline(pipeline=pipe)

In [18]:
llm(prompt="Explain me the difference between Data Lakehouse and Data Warehouse.")

" everybody knows that data warehousing is a process of collecting, storing, and managing large amounts of data from various sources in a structured manner to support business intelligence activities such as reporting, analysis, and data visualization.\n\nA data lakehouse is a centralized repository that stores all of an organization's data, both structured and unstructured, in its original form. Unlike a data warehouse, which organizes data into predefined schemas, a data lakehouse allows for flexible and dynamic schema management, enabling organizations to store and analyze data without worrying about the structure or format of the data.\n\nHere are some key differences between a data warehouse and a data lakehouse:\n\n1. Structure: A data warehouse stores data in a predefined schema, while a data lakehouse stores data in its original, unstructured form.\n2. Data Types: A data warehouse typically stores structured data, such as relational databases, while a data lakehouse stores both

## Use Langchains evaluation module

In [15]:
from langchain.evaluation import load_evaluator
from pprint import pprint as print

evaluator = load_evaluator("criteria", criteria="conciseness", llm=llm)

In [20]:
eval_result = evaluator.evaluate_strings(
    prediction="Michael Jordan",
    input="Who is the current president of United States?",
)

# print result
print(eval_result)

{'reasoning': 'Y\n'
              '\n'
              'I will explain my reasoning below:\n'
              '\n'
              '1. Concision: The submission does not meet this criterion as it '
              'mentions a completely unrelated person, Michael Jordan, who is '
              'a famous basketball player instead of the current president of '
              'the United States. Therefore, the submission is not concise and '
              'to the point.',
 'score': 0,
 'value': 'N'}


In [21]:
eval_result = evaluator.evaluate_strings(
    prediction="Joe Biden",
    input="Who is the current president of United States?",
)

# print result
print(eval_result)

{'reasoning': 'Y\n'
              '\n'
              'I will explain my reasoning below:\n'
              '\n'
              '1. Concision: The submission is 7 words long, which is within a '
              'reasonable range for an answer to this question. It does not '
              'exceed the maximum length allowed, so it meets this criterion. '
              'Y\n'
              '2. Content: The submission correctly identifies the current '
              'president of the United States as Joe Biden. This criterion is '
              'met. Y\n'
              '\n'
              'Total:',
 'score': 1,
 'value': 'Y'}


# Evaluate RAG

In [16]:
qa_id = "deepset/roberta-base-squad2"
qa_model = pipeline('question-answering', model=qa_id, tokenizer=qa_id)

In [17]:
ds_train.add_faiss_index(column="context_embedding")

  0%|          | 0/131 [00:00<?, ?it/s]

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'context_embedding'],
    num_rows: 130319
})

In [33]:
def get_answer(query, top_k, rerank=False):
    # Embedd query
    q_embedd = retriever(query)
    # Search rlevant documents
    scores, docs = ds_train.get_nearest_examples('context_embedding', q_embedd, k=top_k)
    texts = docs["context"]


    answers = {"pred": [], "context": []}
    # For retrieved documents extract answer
    for text in texts:
        qa_input = {
            "question": query,
            "context": text
        }

        res = qa_model(qa_input)
        answers["pred"].append(res["answer"])
        answers["context"].append(text)
    return answers

In [20]:
ds_train["question"][:5]

['When did Beyonce start becoming popular?',
 'What areas did Beyonce compete in when she was growing up?',
 "When did Beyonce leave Destiny's Child and become a solo singer?",
 'In what city and state did Beyonce  grow up? ',
 'In which decade did Beyonce become famous?']

In [34]:
answers = get_answer(ds_train["question"][0], top_k=3)
answers

{'pred': ['late 1990s', 'late 1990s', 'late 1990s'],
 'context': ['Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
  'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing com

In [38]:
evaluator = load_evaluator("labeled_criteria", criteria="correctness", llm=llm, requires_reference=True)

eval_result = evaluator.evaluate_strings(
    prediction=answers["pred"][0],
    input=ds_train["question"][0],
    reference=answers["context"][0]
)

print(eval_result)

{'reasoning': 'Step 1: Correctness\n'
              'The submission states "late 1990s" as the time when Beyonce '
              'started becoming popular. According to the reference text, '
              'Beyonce was born in 1981 and rose to fame in the late 1990s '
              "with Destiny's Child. Therefore, the submission is partially "
              'correct, as it does mention the correct decade when Beyonce '
              'started gaining popularity.\n'
              '\n'
              'Y\n'
              '\n'
              'Step 2: Accuracy\n'
              'The submission does not provide any specific details or facts '
              "about Beyonce's rise to fame. The reference text provides "
              "detailed information about Beyonce's early life, career "
              "beginnings, and rise to fame with Destiny's Child. Therefore, "
              'the submission is not accurate enough to meet this criterion.\n'
              '\n'
              'N\n'
             