# Decoder-Test-No-Loop

In [1]:
!pip install -U transformers sentencepiece bitsandbytes accelerate rouge_score langchain python-dotenv install -U "ray" --quiet

In [2]:
import os
from dotenv import load_dotenv

# Let python-dotenv populate the process environment, then read the
# Hugging Face access token stored under the "HF" variable (None if unset).
load_dotenv()
hf_auth = os.environ.get("HF")

In [3]:
from huggingface_hub import login
# Authenticate against the Hugging Face Hub with the token read into `hf_auth`.
login(token=hf_auth)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/chkei001/.cache/huggingface/token
Login successful


In [4]:
import torch

# Prefer the first CUDA device when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
device

'cuda:0'

## Evaluator LLM (Llama 2 70b)

In [5]:
from torch import bfloat16
from transformers import BitsAndBytesConfig

# Quantisation settings for the evaluator model: 4-bit loading with the "nf4"
# quant type, double quantisation enabled, and bfloat16 as the compute dtype.
_evaluator_quant_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": bfloat16,
}
evaluator_bnb_config = BitsAndBytesConfig(**_evaluator_quant_options)

In [6]:
from transformers import AutoConfig

# Hub id of the evaluator model; `token=True` asks the library to use the
# stored Hugging Face credentials when fetching the configuration.
model_id = "meta-llama/Llama-2-70b-chat-hf"
model_config = AutoConfig.from_pretrained(model_id, token=True)

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the evaluator model with the 4-bit quantisation config defined above.
# `device_map='auto'` leaves weight placement to the library.
# `trust_remote_code` is intentionally not set: the Llama architecture ships
# with transformers, so there is no reason to allow repository-side code to run.
llama = AutoModelForCausalLM.from_pretrained(
    model_id,
    config=model_config,
    quantization_config=evaluator_bnb_config,
    device_map='auto',
    token=True
)

# Tokenizer matching the evaluator model.
llama_tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    token=True
)

Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]



In [8]:
from transformers import pipeline

# Text-generation pipeline wrapping the evaluator model and its tokenizer.
pipe = pipeline(
    task='text-generation',
    model=llama,
    tokenizer=llama_tokenizer,
    # Original author's note: langchain expects the full text.
    return_full_text=True,
    # Low temperature to keep the evaluator's output close to deterministic.
    temperature=0.1,
    # Original author's note: without this the output begins repeating.
    repetition_penalty=1.1,
)

2023-12-07 07:51:25.122610: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-07 07:51:25.954123: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-12-07 07:51:25.954278: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64


In [9]:
from langchain import HuggingFacePipeline

# Wrap the transformers pipeline so langchain evaluators can use it as their LLM.
llm = HuggingFacePipeline(pipeline=pipe)

## Decoder Pipeline and evaluation

In [10]:
from datasets import load_dataset

# Load the "squad_v2" dataset and display its splits.
ds = load_dataset("squad_v2")
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

In [11]:
# Handles on the two dataset splits; the evaluation cells read from `val`.
trn, val = ds["train"], ds["validation"]

In [12]:
from transformers import Pipeline, TextIteratorStreamer
from threading import Thread

class DecoderPipeline(Pipeline):
    """Custom pipeline that generates a completion for a prompt and returns it as one string.

    The pipeline only uses the model and tokenizer it was constructed with
    (``self.model`` / ``self.tokenizer``), so it does not depend on notebook
    globals of the same name.

    Call-time / init-time parameters:
        max_new_tokens (int, optional): upper bound on generated tokens;
            defaults to 200 when not given.
    """

    def _sanitize_parameters(self, max_new_tokens=None, **kwargs):
        """Split keyword arguments into (preprocess, forward, postprocess) parameter dicts.

        Only ``max_new_tokens`` is recognised and routed to ``_forward``;
        any other keyword argument is ignored.
        """
        forward_kwargs = {}
        if max_new_tokens is not None:
            forward_kwargs["max_new_tokens"] = max_new_tokens
        return {}, forward_kwargs, {}

    def preprocess(self, text):
        """Tokenize ``text`` into PyTorch tensors placed on the pipeline's device."""
        # TODO: Prompt Template -> tokenizer.apply_chat_template
        if self.tokenizer.pad_token is None:
            # NOTE(review): this adds a token without resizing the model's
            # embeddings; presumably harmless while a single prompt is encoded
            # (nothing to pad) - confirm before batching inputs.
            self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        return self.tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(self.device)

    def _forward(self, inputs, max_new_tokens=200):
        """Run generation in a worker thread and collect the streamed text.

        Returns the concatenation of all text chunks produced by the streamer
        (prompt and special tokens are skipped by the streamer settings).
        """
        streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)
        generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
        thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
        thread.start()
        chunks = []
        for new_text in streamer:
            chunks.append(new_text)
        # Make sure the worker has fully finished before handing the result back.
        thread.join()
        return "".join(chunks)

    def postprocess(self, outputs):
        """Flatten the generated text onto one line.

        A period followed by a newline becomes a single space; every
        remaining newline is removed.
        """
        outputs = outputs.replace(".\n", " ")
        outputs = outputs.replace("\n", "")
        return outputs

In [13]:
def get_prompt(question, context):
    """Build the question-answering prompt sent to the model under test.

    Args:
        question: the question text, interpolated after ``question:``.
        context: the passage text, interpolated after ``context:``.

    Returns:
        The instruction line, followed by the indented ``question``,
        ``context`` and ``Answer:`` sections.
    """
    instruction = (
        "Answer the following Question based on the Context only. "
        "Only answer from the Context. "
        "If you don't know the answer, say 'I don't know'."
    )
    # The four-space indents and triple newlines are part of the prompt text.
    return (
        f"{instruction}\n"
        f"    question: {question}\n\n\n"
        f"    context: {context}\n\n\n"
        "    Answer:\n"
    )

In [14]:
from langchain.evaluation import load_evaluator

# langchain "context_qa" evaluator, graded by the evaluator LLM (`llm`).
qa_evaluator = load_evaluator("context_qa", llm=llm)

def evaluate_context_qa(question, prediction, context, answer):
    """Grade ``prediction`` with the module-level ``qa_evaluator``.

    Args:
        question: passed to the evaluator as ``input``.
        prediction: the model output being graded.
        context: passed to the evaluator as ``context``.
        answer: passed to the evaluator as ``reference``.

    Returns:
        The ``"score"`` entry of the evaluator result (``None`` if absent).
    """
    # NOTE(review): confirm which of `reference` / `context` the "context_qa"
    # evaluator actually consumes; both are forwarded here unchanged.
    evaluation = qa_evaluator.evaluate_strings(
        input=question,
        prediction=prediction,
        context=context,
        reference=answer,
    )
    return evaluation.get("score")

In [15]:
# langchain "labeled_criteria" evaluator using the "correctness" criterion, graded by the evaluator LLM (`llm`).
correctness_evaluator = load_evaluator("labeled_criteria", criteria="correctness", llm=llm, requires_reference=True)

def evaluate_correctness(prompt, prediction, context):
    """Grade ``prediction`` with the module-level ``correctness_evaluator``.

    Args:
        prompt: the full prompt given to the model, passed as ``input``.
        prediction: the model output being graded.
        context: the passage; passed both as ``context`` and as ``reference``.

    Returns:
        A ``(score, reasoning)`` tuple taken from the evaluator result;
        either element is ``None`` when the evaluator did not provide it.
    """
    outcome = correctness_evaluator.evaluate_strings(
        input=prompt,
        prediction=prediction,
        context=context,
        reference=context,  # the passage itself is the reference, not the gold answer
    )
    score = outcome.get("score")
    reasoning = outcome.get("reasoning")
    return score, reasoning

### HuggingFaceH4/zephyr-7b-beta

In [16]:
from tqdm import tqdm
import pandas as pd
from transformers import logging
import sys

# Silence Python warnings when no warning options were supplied to the interpreter.
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

# Number of validation examples to evaluate for this model.
iterations = 120

# One record per evaluated example.
results = []

# Model under test. No explicit device is passed to the pipeline; the model
# is loaded with device_map='auto'.
model_id = "HuggingFaceH4/zephyr-7b-beta"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto')
decoder = DecoderPipeline(model=model, tokenizer=tokenizer)

with tqdm(total=iterations) as pbar:
    for i in range(iterations):
        # Data: an example without gold answers is treated as unanswerable,
        # in which case the expected reply is "I don't know".
        sample = val[i]
        question = sample["question"]
        context = sample["context"]
        answer_list = sample["answers"].get("text")
        is_possible = len(answer_list) > 0
        answer = answer_list[0] if is_possible else "I don't know"

        # Inference
        prompt = get_prompt(question=question, context=context)
        pred = decoder(prompt)

        # Evaluation
        qa_score = evaluate_context_qa(
            question=question,
            prediction=pred,
            context=context,
            answer=answer,
        )
        correctness_score, reasoning = evaluate_correctness(
            prompt=prompt,
            prediction=pred,
            context=context,
        )

        results.append(
            dict(
                name=model_id,
                question=question,
                prediction=pred,
                is_possible=is_possible,
                context=context,
                qa_score=qa_score,
                correctness_score=correctness_score,
                reasoning=reasoning,
                prompt=prompt,
            )
        )

        # Release cached CUDA memory between examples.
        torch.cuda.empty_cache()
        pbar.update(1)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/120 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 120/120 [43:11<00:00, 21.59s/it] 


In [17]:
# Persist this model's records; "/" in the model id is replaced by "-" to form the file name.
output_path = model_id.replace("/", "-") + ".pkl"
df = pd.DataFrame(results)
df.to_pickle(output_path)

In [18]:
# clean up: drop the notebook's references to the evaluated model, then run the
# garbage collector so objects kept alive only by reference cycles are freed
# before asking PyTorch to release its cached CUDA memory.
import gc

del model
del tokenizer
del decoder
gc.collect()
torch.cuda.empty_cache()

### microsoft/Orca-2-13b

In [19]:
# Number of validation examples to evaluate for this model.
iterations = 120

# One record per evaluated example.
results = []

# Model under test. No explicit device is passed to the pipeline; the model
# is loaded with device_map='auto'.
model_id = "microsoft/Orca-2-13b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto')
decoder = DecoderPipeline(model=model, tokenizer=tokenizer)

with tqdm(total=iterations) as pbar:
    for i in range(iterations):
        # Data: an example without gold answers is treated as unanswerable,
        # in which case the expected reply is "I don't know".
        sample = val[i]
        question = sample["question"]
        context = sample["context"]
        answer_list = sample["answers"].get("text")
        is_possible = len(answer_list) > 0
        answer = answer_list[0] if is_possible else "I don't know"

        # Inference
        prompt = get_prompt(question=question, context=context)
        pred = decoder(prompt)

        # Evaluation
        qa_score = evaluate_context_qa(
            question=question,
            prediction=pred,
            context=context,
            answer=answer,
        )
        correctness_score, reasoning = evaluate_correctness(
            prompt=prompt,
            prediction=pred,
            context=context,
        )

        results.append(
            dict(
                name=model_id,
                question=question,
                prediction=pred,
                is_possible=is_possible,
                context=context,
                qa_score=qa_score,
                correctness_score=correctness_score,
                reasoning=reasoning,
                prompt=prompt,
            )
        )

        # Release cached CUDA memory between examples.
        torch.cuda.empty_cache()
        pbar.update(1)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

100%|██████████| 120/120 [1:11:45<00:00, 35.88s/it]


In [20]:
# Persist this model's records; "/" in the model id is replaced by "-" to form the file name.
output_path = model_id.replace("/", "-") + ".pkl"
df = pd.DataFrame(results)
df.to_pickle(output_path)

In [21]:
# clean up: drop the notebook's references to the evaluated model, then run the
# garbage collector so objects kept alive only by reference cycles are freed
# before asking PyTorch to release its cached CUDA memory.
import gc

del model
del tokenizer
del decoder
gc.collect()
torch.cuda.empty_cache()

### mistralai/Mistral-7B-Instruct-v0.1

In [None]:
# Number of validation examples to evaluate for this model.
iterations = 120

# One record per evaluated example.
results = []

# Model under test; loaded with device_map='auto'.
model_id = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto')
decoder = DecoderPipeline(model=model, tokenizer=tokenizer)

# A single progress bar, owned by the context manager so it is always closed
# (no second, never-closed bar is created beforehand).
with tqdm(total=iterations) as pbar:
    for i in range(iterations):
        # Data: fetch the row once. An example without gold answers is
        # treated as unanswerable, so the expected reply is "I don't know".
        sample = val[i]
        question = sample["question"]
        context = sample["context"]
        answer_list = sample["answers"].get("text")
        is_possible = len(answer_list) > 0
        answer = answer_list[0] if is_possible else "I don't know"

        # Inference
        prompt = get_prompt(question=question, context=context)
        pred = decoder(prompt)

        # Evaluation
        qa_score = evaluate_context_qa(
            question=question,
            prediction=pred,
            context=context,
            answer=answer,
        )
        correctness_score, reasoning = evaluate_correctness(
            prompt=prompt,
            prediction=pred,
            context=context,
        )

        results.append(
            dict(
                name=model_id,
                question=question,
                prediction=pred,
                is_possible=is_possible,
                context=context,
                qa_score=qa_score,
                correctness_score=correctness_score,
                reasoning=reasoning,
                prompt=prompt,
            )
        )

        # Release cached CUDA memory between examples.
        torch.cuda.empty_cache()
        pbar.update(1)

In [None]:
# Persist this model's records; "/" in the model id is replaced by "-" to form the file name.
output_path = model_id.replace("/", "-") + ".pkl"
df = pd.DataFrame(results)
df.to_pickle(output_path)

In [None]:
# clean up: drop the notebook's references to the evaluated model, then run the
# garbage collector so objects kept alive only by reference cycles are freed
# before asking PyTorch to release its cached CUDA memory.
import gc

del model
del tokenizer
del decoder
gc.collect()
torch.cuda.empty_cache()

### Intel/neural-chat-7b-v3-1

In [None]:
# Number of validation examples to evaluate for this model.
iterations = 120

# One record per evaluated example.
results = []

# Model under test; loaded with device_map='auto'.
model_id = "Intel/neural-chat-7b-v3-1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto')
decoder = DecoderPipeline(model=model, tokenizer=tokenizer)

# A single progress bar, owned by the context manager so it is always closed
# (no second, never-closed bar is created beforehand).
with tqdm(total=iterations) as pbar:
    for i in range(iterations):
        # Data: fetch the row once. An example without gold answers is
        # treated as unanswerable, so the expected reply is "I don't know".
        sample = val[i]
        question = sample["question"]
        context = sample["context"]
        answer_list = sample["answers"].get("text")
        is_possible = len(answer_list) > 0
        answer = answer_list[0] if is_possible else "I don't know"

        # Inference
        prompt = get_prompt(question=question, context=context)
        pred = decoder(prompt)

        # Evaluation
        qa_score = evaluate_context_qa(
            question=question,
            prediction=pred,
            context=context,
            answer=answer,
        )
        correctness_score, reasoning = evaluate_correctness(
            prompt=prompt,
            prediction=pred,
            context=context,
        )

        results.append(
            dict(
                name=model_id,
                question=question,
                prediction=pred,
                is_possible=is_possible,
                context=context,
                qa_score=qa_score,
                correctness_score=correctness_score,
                reasoning=reasoning,
                prompt=prompt,
            )
        )

        # Release cached CUDA memory between examples.
        torch.cuda.empty_cache()
        pbar.update(1)

In [None]:
# Persist this model's records; "/" in the model id is replaced by "-" to form the file name.
output_path = model_id.replace("/", "-") + ".pkl"
df = pd.DataFrame(results)
df.to_pickle(output_path)

In [None]:
# clean up: drop the notebook's references to the evaluated model, then run the
# garbage collector so objects kept alive only by reference cycles are freed
# before asking PyTorch to release its cached CUDA memory.
import gc

del model
del tokenizer
del decoder
gc.collect()
torch.cuda.empty_cache()

### tiiuae/falcon-7b-instruct

In [None]:
# Number of validation examples to evaluate for this model.
iterations = 120

# One record per evaluated example.
results = []

# Model under test; loaded with device_map='auto'.
model_id = "tiiuae/falcon-7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto')
decoder = DecoderPipeline(model=model, tokenizer=tokenizer)

# A single progress bar, owned by the context manager so it is always closed
# (no second, never-closed bar is created beforehand).
with tqdm(total=iterations) as pbar:
    for i in range(iterations):
        # Data: fetch the row once. An example without gold answers is
        # treated as unanswerable, so the expected reply is "I don't know".
        sample = val[i]
        question = sample["question"]
        context = sample["context"]
        answer_list = sample["answers"].get("text")
        is_possible = len(answer_list) > 0
        answer = answer_list[0] if is_possible else "I don't know"

        # Inference
        prompt = get_prompt(question=question, context=context)
        pred = decoder(prompt)

        # Evaluation
        qa_score = evaluate_context_qa(
            question=question,
            prediction=pred,
            context=context,
            answer=answer,
        )
        correctness_score, reasoning = evaluate_correctness(
            prompt=prompt,
            prediction=pred,
            context=context,
        )

        results.append(
            dict(
                name=model_id,
                question=question,
                prediction=pred,
                is_possible=is_possible,
                context=context,
                qa_score=qa_score,
                correctness_score=correctness_score,
                reasoning=reasoning,
                prompt=prompt,
            )
        )

        # Release cached CUDA memory between examples.
        torch.cuda.empty_cache()
        pbar.update(1)

In [None]:
# Persist this model's records; "/" in the model id is replaced by "-" to form the file name.
output_path = model_id.replace("/", "-") + ".pkl"
df = pd.DataFrame(results)
df.to_pickle(output_path)

In [None]:
# clean up: drop the notebook's references to the evaluated model, then run the
# garbage collector so objects kept alive only by reference cycles are freed
# before asking PyTorch to release its cached CUDA memory.
import gc

del model
del tokenizer
del decoder
gc.collect()
torch.cuda.empty_cache()

### meta-llama/Llama-2-7b-chat-hf

In [None]:
# Number of validation examples to evaluate for this model.
iterations = 120

# One record per evaluated example.
results = []

# Model under test; its configuration is fetched with the stored Hub
# credentials (token=True) and the model is loaded with device_map='auto'.
model_id = "meta-llama/Llama-2-7b-chat-hf"

model_config = AutoConfig.from_pretrained(
    pretrained_model_name_or_path=model_id,
    token=True
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto', config=model_config)
decoder = DecoderPipeline(model=model, tokenizer=tokenizer)

# A single progress bar, owned by the context manager so it is always closed
# (no second, never-closed bar is created beforehand).
with tqdm(total=iterations) as pbar:
    for i in range(iterations):
        # Data: fetch the row once. An example without gold answers is
        # treated as unanswerable, so the expected reply is "I don't know".
        sample = val[i]
        question = sample["question"]
        context = sample["context"]
        answer_list = sample["answers"].get("text")
        is_possible = len(answer_list) > 0
        answer = answer_list[0] if is_possible else "I don't know"

        # Inference
        prompt = get_prompt(question=question, context=context)
        pred = decoder(prompt)

        # Evaluation
        qa_score = evaluate_context_qa(
            question=question,
            prediction=pred,
            context=context,
            answer=answer,
        )
        correctness_score, reasoning = evaluate_correctness(
            prompt=prompt,
            prediction=pred,
            context=context,
        )

        results.append(
            dict(
                name=model_id,
                question=question,
                prediction=pred,
                is_possible=is_possible,
                context=context,
                qa_score=qa_score,
                correctness_score=correctness_score,
                reasoning=reasoning,
                prompt=prompt,
            )
        )

        # Release cached CUDA memory between examples.
        torch.cuda.empty_cache()
        pbar.update(1)

In [None]:
# Persist this model's records; "/" in the model id is replaced by "-" to form the file name.
output_path = model_id.replace("/", "-") + ".pkl"
df = pd.DataFrame(results)
df.to_pickle(output_path)

In [None]:
# clean up: drop the notebook's references to the evaluated model, then run the
# garbage collector so objects kept alive only by reference cycles are freed
# before asking PyTorch to release its cached CUDA memory.
import gc

del model
del tokenizer
del decoder
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Drop the notebook's name for the evaluator model.
# NOTE(review): `pipe` was built with model=llama, so it (and anything created
# from it) presumably still references the model - confirm this frees memory.
del llama

In [None]:
# Ask PyTorch to release cached CUDA memory it is no longer using.
torch.cuda.empty_cache()

In [None]:
    #"meta-llama/Llama-2-7b-chat-hf",
    #"meta-llama/Llama-2-70b-chat-hf", -> Config

In [None]:
# NOTE(review): `evaluator` is not assigned anywhere in the visible notebook
# (the evaluators are named `qa_evaluator` and `correctness_evaluator`). Unless
# it is defined in a cell not shown here, this leftover cell would raise
# NameError on a fresh run - confirm and remove or rename.
evaluator