# Benchmark config

In [1]:
# Run the benchmark with the RAG stage. This flag also gates the RAGPipeline
# import, the adapter loading and the per-question document retrieval.
rag_enabled = True
# Selects which BitsAndBytes configuration is used when the model is loaded;
# any value other than '8bit' or '4bit' loads the model without quantization.
quantization = '8bit' # Valid values: None, '8bit', '4bit'
# Path the per-domain results table is written to with DataFrame.to_csv.
results_file_name = 'rag_8bit.csv'

# Directory appended to sys.path so that the RAGPipeline module can be imported.
RAGPipeline_module_dir = '/home/matlab/data/Dominik/app'
# Location passed to model.load_adapter when RAG is enabled.
rag_adapter_path = './fine_tuning/fine_tuned_models'

In [2]:
import re
import sys

import matplotlib.pyplot as plt
import pandas as pd
import torch
import transformers
from datasets import load_dataset
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain.prompts import PromptTemplate
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import MistralConfig, BitsAndBytesConfig
from tqdm.notebook import tqdm

if rag_enabled:
    sys.path.append(RAGPipeline_module_dir)
    from RAGPipeline import RAGPipeline

In [3]:
def prepare_sample_questions(sample_questions):
    """
    Turn a column-oriented batch of questions into a list of per-question dicts.

    Parameters:
    sample_questions (dict): batch with parallel 'question', 'choices' and
        'answer' sequences; every 'choices' entry must hold at least four
        options and every 'answer' entry must be an index into A-D

    Returns:
    (list): one dict per question with the keys 'question', 'answer_A',
        'answer_B', 'answer_C', 'answer_D' and 'correct_answer' (the letter
        of the right option)
    """
    option_letters = 'ABCD'
    prepared = []
    for position, question_text in enumerate(sample_questions['question']):
        options = sample_questions['choices'][position]
        answer_index = sample_questions['answer'][position]
        prepared.append({
            'question': question_text,
            'answer_A': options[0],
            'answer_B': options[1],
            'answer_C': options[2],
            'answer_D': options[3],
            'correct_answer': option_letters[answer_index],
        })
    return prepared


def prepare_prompt(domain, sample_questions, question, question_context = '\nNone\n'):
    """
    Build the few-shot multiple-choice prompt for one benchmark question.

    Parameters:
    domain (str): benchmark domain name; underscores are shown as spaces
    sample_questions (list): solved example questions, each a dict with the
        keys 'question', 'answer_A' ... 'answer_D' and 'correct_answer'
    question (dict): the question to answer, with 'question' and a 'choices'
        sequence of at least four options
    question_context (str): text placed between the <context> tags

    Returns:
    (str): the complete prompt, ending with an open "Answer: " slot followed
        by the closing [/INST] tag
    """
    question_template = "\nQuestion: {}\nA: {}\nB: {}\nC: {}\nD: {}\nAnswer: {}"
    readable_domain = domain.replace("_", " ")

    # Instruction header followed by the (optional) context block
    pieces = [
        "<s>[INST] The following are multiple choice questions (with answers) about {} with optional, helpful context for the last question.".format(readable_domain),
        "\n\n<context>{}</context>\n".format(question_context),
    ]

    # Solved examples, each terminated by a newline
    for example in sample_questions:
        solved = question_template.format(
            example['question'],
            example['answer_A'],
            example['answer_B'],
            example['answer_C'],
            example['answer_D'],
            example['correct_answer'],
        )
        pieces.append(solved)
        pieces.append('\n')

    # The question the model must answer: same layout, empty answer slot
    options = question['choices']
    unanswered = question_template.format(
        question['question'],
        options[0],
        options[1],
        options[2],
        options[3],
        '',
    )
    pieces.append(unanswered)
    pieces.append('[/INST]')
    return ''.join(pieces)


def convert_answer(answer):
    """
    Map the model's letter answer onto the numeric answer index used by the
    MMLU dataset.

    Parameters:
    answer (str): the model's answer (only the first character is inspected,
        in either upper or lower case)

    Returns:
    (int): 0-3 for A-D, or 4 when the answer is empty or starts with any
        other character
    """
    invalid_answer = 4  # Marks an answer that doesn't meet the stated requirements
    letter_to_index = {
        'A': 0, 'a': 0,
        'B': 1, 'b': 1,
        'C': 2, 'c': 2,
        'D': 3, 'd': 3,
    }
    if len(answer) == 0:
        return invalid_answer
    return letter_to_index.get(answer[0], invalid_answer)

def sanitize_IDs(model_output, ids_max_count=4, max_id=7):
    """
    Extract the valid document IDs from the parsed output of the
    document-selection model.

    Each element is read as a whole (optionally negative) integer taken from
    its start, so '-1' and multi-digit IDs are recognised instead of only the
    first character being inspected.

    Parameters:
    model_output (list): strings produced by splitting the model's answer
    ids_max_count (int): maximum number of elements to consider
    max_id (int): highest ID that is accepted

    Returns:
    (list): accepted IDs as ints, in output order. Reading stops at the first
        element that is empty, has no leading integer or is out of range;
        an explicit -1 ("no relevant documents") yields an empty list.
    """
    sanitized_IDs = []
    for idx, el in enumerate(model_output):
        # Never look past the first ids_max_count elements
        if idx == ids_max_count:
            break
        # Only ASCII digits are matched so that int() can never fail on
        # characters such as superscripts, which str.isdigit() accepts
        match = re.match(r'\s*(-?[0-9]+)', el)
        if match is None:
            break
        doc_id = int(match.group(1))
        # -1 is the model's way of saying that none of the documents are relevant
        if doc_id == -1:
            return []
        if not 0 <= doc_id <= max_id:
            break
        sanitized_IDs.append(doc_id)
    return sanitized_IDs

def select_docs_by_id(documents_raw, ids):
    """
    Keep only the retrieved entries whose document ID was selected.

    Parameters:
    documents_raw (iterable): retrieved entries; the first item of every
        entry must expose a `metadata` mapping with an 'ID' key
    ids (container): IDs to keep

    Returns:
    (list): the entries of documents_raw, in their original order, whose ID
        is contained in ids
    """
    return [entry for entry in documents_raw if entry[0].metadata['ID'] in ids]

# Load the base model

In [4]:
# Checkpoint and tokenizer used for every benchmark run
base_model = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Translate the `quantization` setting into the extra loader arguments and the
# message reported once the weights are in place
if quantization == '8bit':
    bnb_config = BitsAndBytesConfig(load_in_8bit=True)
    quantization_kwargs = {'quantization_config': bnb_config}
    loaded_message = 'Model loaded with 8-bit quantization'
elif quantization == '4bit':
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=False,
    )
    quantization_kwargs = {'quantization_config': bnb_config}
    loaded_message = 'Model loaded with 4-bit quantization'
else:
    # Any other value (including None) loads the weights without quantization
    quantization_kwargs = {}
    loaded_message = 'Model loaded with no quantization'

# NOTE(review): `config` receives the MistralConfig class itself rather than
# an instance - confirm this is intended
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    config=MistralConfig,
    device_map='cuda',
    **quantization_kwargs,
)
print(loaded_message)

# Without RAG there is nothing to prepare; otherwise attach the adapter, build
# the retrieval pipeline and define the document-selection prompt
if not rag_enabled:
    print('RAG disabled')
else:
    # The adapter is registered and selected but switched off again right
    # away; the benchmark loop enables it only around the document-selection
    # generation call
    model.load_adapter(rag_adapter_path, adapter_name='rag_adapter')
    model.set_adapter('rag_adapter')
    model.disable_adapters()
    rag_pipeline = RAGPipeline()
    print('RAG pipeline loaded')
    # Written line by line with explicit newlines so that the trailing
    # whitespace inside the prompt stays visible
    rag_prompt_template = (
        "<s>[INST] Below is a list of documents. Return up to 4 IDs of documents most useful for solving the user_prompt. If no documents are relevant, output -1. {format}. \n"
        "    \n"
        "<documents>\n"
        "{documents}\n"
        "</documents>\n"
        "\n"
        "user_prompt: {user_prompt}\n"
        "\n"
        "[/INST]IDs: "
    )
    # Supplies the {format} instructions and splits the model's answer into a list
    output_parser = CommaSeparatedListOutputParser()

# model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", config=MistralConfig)
# tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

# pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, pad_token_id=tokenizer.eos_token_id, max_new_tokens=1, device=0)
# pipe.tokenizer.pad_token_id = model.config.eos_token_id
# LLM = HuggingFacePipeline(pipeline=pipe)

# prompt_template = """<s>[INST] Your objective is to select the right answer for the question stated below.
# If question 'A' is the correct answer, output 'A'; if 'B' is the correct answer, output 'B', and so on.
# Do not explain or justify your answer. Your output must be a single letter: 'A', 'B', 'C', or 'D'.
# Question: {question}
# A: {answer_A}
# B: {answer_B}
# C: {answer_C}
# D: {answer_D}
# Your answer: [/INST]
# """

# prompt = PromptTemplate.from_template(prompt_template)
# chain = prompt | LLM



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Model loaded with 8-bit quantization


<All keys matched successfully>


RAG pipeline loaded


In [5]:
MMLU_domains = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science',
                'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering',
                'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science',
                'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics',
                'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history',
                'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics',
                'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law',
                'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']
# Per-domain counters filled in by the benchmark loop below
right_answers = {}
question_count = {}
bad_format_answers = {}


def _select_rag_context(question_text):
    """
    Retrieve documents for one question and let the RAG adapter pick the
    useful ones.

    Parameters:
    question_text (str): the bare question, used both as the retrieval query
        and as the user_prompt shown to the document-selection model

    Returns:
    (str or None): the selected documents formatted for the answer prompt
        (prefixed with a newline), or None when nothing was retrieved or no
        document was selected
    """
    documents_raw = rag_pipeline.retrieve_relevant_data(question_text)
    if len(documents_raw) == 0:
        return None
    documents_llm = rag_pipeline.format_docs_for_LLM(documents_raw)
    RAG_prompt = rag_prompt_template.format(format=output_parser.get_format_instructions(), documents=documents_llm, user_prompt=question_text)
    tokenized_context = tokenizer(RAG_prompt, return_tensors="pt").to('cuda')
    if len(tokenized_context.input_ids[0]) > 5000:
        # Bypass the LLM filter if there are too many tokens to handle and
        # fall back to the first (up to) four document IDs
        document_IDs_sanitized = list(range(min(len(documents_raw), 4)))
    else:
        model.enable_adapters()
        try:
            # NOTE(review): pad_token_id=2 is hard-coded - presumably the
            # tokenizer's EOS token id; confirm
            response = model.generate(tokenized_context.input_ids, pad_token_id=2, attention_mask=tokenized_context.attention_mask, do_sample=False, max_new_tokens=8)
        finally:
            # Always switch the adapter off again so that a failed generation
            # cannot leave it active for the answer-generation step
            model.disable_adapters()
        response = response[0][tokenized_context.input_ids.shape[1]:] # Remove the input from the output
        output = tokenizer.decode(response, skip_special_tokens=True)
        document_IDs_sanitized = sanitize_IDs(output_parser.parse(output))
    if len(document_IDs_sanitized) == 0:
        return None
    relevant_documents_raw = select_docs_by_id(documents_raw, document_IDs_sanitized)
    if len(relevant_documents_raw) == 0:
        return None
    return "\n" + rag_pipeline.format_doc_for_LLM_no_ids(relevant_documents_raw)


for domain in MMLU_domains:
    samples_changed = False
    dataset = load_dataset("cais/mmlu", domain, split='test', trust_remote_code=True)
    right_answers[domain] = 0
    bad_format_answers[domain] = 0
    question_count[domain] = len(dataset)
    # The first half of the questions uses the last five questions of the
    # dataset as solved examples
    sample_questions = dataset[-5:]
    sample_questions = prepare_sample_questions(sample_questions)
    for idx, question in enumerate(tqdm(dataset)):
        # The context is computed from scratch for every question, so
        # documents selected for an earlier question can never end up in
        # this question's prompt
        question_context = None
        if rag_enabled:
            torch.cuda.empty_cache()
            question_context = _select_rag_context(question['question'])
        # The full few-shot prompt is always built, also when retrieval
        # returned nothing
        if question_context is not None:
            prompt = prepare_prompt(domain, sample_questions, question, question_context)
        else:
            prompt = prepare_prompt(domain, sample_questions, question)
        tokenized_context = tokenizer(prompt, return_tensors="pt").to('cuda')
        # A single new token is generated: the answer letter
        response = model.generate(tokenized_context.input_ids, pad_token_id=2, attention_mask=tokenized_context.attention_mask, do_sample=False, max_new_tokens=1)
        response = response[0][tokenized_context.input_ids.shape[1]:] # Remove the input from the output
        answer = tokenizer.decode(response, skip_special_tokens=True)
        llm_answer = convert_answer(answer)
        if llm_answer == question['answer']:
            right_answers[domain] += 1
        elif llm_answer == 4:
            bad_format_answers[domain] += 1
        # After going over approx 50% of questions, select sample questions from
        # the beginning of the dataset
        if not samples_changed and idx > len(dataset) / 2:
            sample_questions = dataset[:5]
            sample_questions = prepare_sample_questions(sample_questions)
            samples_changed = True
            torch.cuda.empty_cache()
        

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/135 [00:00<?, ?it/s]

  0%|          | 0/152 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/265 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/173 [00:00<?, ?it/s]

  0%|          | 0/102 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/235 [00:00<?, ?it/s]

  0%|          | 0/114 [00:00<?, ?it/s]

  0%|          | 0/145 [00:00<?, ?it/s]

  0%|          | 0/378 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/203 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/165 [00:00<?, ?it/s]

  0%|          | 0/198 [00:00<?, ?it/s]

  0%|          | 0/193 [00:00<?, ?it/s]

  0%|          | 0/390 [00:00<?, ?it/s]

  0%|          | 0/270 [00:00<?, ?it/s]

  0%|          | 0/238 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/545 [00:00<?, ?it/s]

  0%|          | 0/216 [00:00<?, ?it/s]

  0%|          | 0/204 [00:00<?, ?it/s]

  0%|          | 0/237 [00:00<?, ?it/s]

  0%|          | 0/223 [00:00<?, ?it/s]

  0%|          | 0/131 [00:00<?, ?it/s]

  0%|          | 0/121 [00:00<?, ?it/s]

  0%|          | 0/108 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/112 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/234 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/783 [00:00<?, ?it/s]

  0%|          | 0/346 [00:00<?, ?it/s]

  0%|          | 0/895 [00:00<?, ?it/s]

  0%|          | 0/306 [00:00<?, ?it/s]

  0%|          | 0/311 [00:00<?, ?it/s]

  0%|          | 0/324 [00:00<?, ?it/s]

  0%|          | 0/282 [00:00<?, ?it/s]

  0%|          | 0/1534 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

  0%|          | 0/612 [00:00<?, ?it/s]

  0%|          | 0/110 [00:00<?, ?it/s]

  0%|          | 0/245 [00:00<?, ?it/s]

  0%|          | 0/201 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/166 [00:00<?, ?it/s]

  0%|          | 0/171 [00:00<?, ?it/s]

In [6]:
def _percent(part, whole):
    """
    Express part / whole as a percentage rounded to three decimals.

    Returns NaN instead of raising when whole is 0, so that a domain in which
    every answer was badly formatted cannot abort the export.
    """
    if whole == 0:
        return float('nan')
    return round(part / whole * 100, 3)


result_columns = [
    'category',
    'question_count',
    'right_answers',
    'right_answers_percent',
    'right_answers_percent_(ignore_bad_format)',
    'bad_format_answers',
    'bad_format_answers_percent'
]
# Collect one row per domain first and build the table in a single step
result_rows = []
for domain in MMLU_domains:
    # Questions for which the model produced one of the letters A-D
    well_formatted_count = question_count[domain] - bad_format_answers[domain]
    result_rows.append({
        'category': domain,
        'question_count': question_count[domain],
        'right_answers': right_answers[domain],
        'right_answers_percent': _percent(right_answers[domain], question_count[domain]),
        'right_answers_percent_(ignore_bad_format)': _percent(right_answers[domain], well_formatted_count),
        'bad_format_answers': bad_format_answers[domain],
        'bad_format_answers_percent': _percent(bad_format_answers[domain], question_count[domain])
    })
results = pd.DataFrame(result_rows, columns=result_columns)
results.to_csv(results_file_name)