## Import packages

In [1]:
! pip install nltk numpy pandas unidecode python-dotenv tqdm rouge-score chromadb pdfminer.six docarray pymupdf llm-blender
! pip install transformers torch scikit-learn
! pip install langchain langchain-core langchain-community langchain_experimental langchain-chroma langchain_mistralai



In [2]:
import llm_blender
import os
import json
import nltk
import string
import numpy as np
import pandas as pd
from unidecode import unidecode
from sklearn.metrics.pairwise import cosine_similarity
from pathlib import Path
from tqdm import tqdm
from dotenv import load_dotenv
from getpass import getpass
from rouge_score import rouge_scorer

from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain_mistralai.chat_models import ChatMistralAI
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser

  from .autonotebook import tqdm as notebook_tqdm


## Setup environment variables

You have to define the following environment variables in the `.env` file, terminal environment, or input field within this Jupyter notebook:
1. MISTRAL_API_KEY
2. OPENAI_API_KEY

## Import packages

In [3]:
env_variables = [
  'MISTRAL_API_KEY',
  'OPENAI_API_KEY',
]

load_dotenv()

for key in env_variables:
  value = os.getenv(key)

  if value is None:
    value = getpass(key)

  os.environ[key] = value

## Download NLTK dictionaries

These dictionaries are needed for further text preprocessing.

In [4]:
dict_ids = [
  'punkt_tab',
  'punkt',
  'stopwords',
  'wordnet',
]

for dict_id in dict_ids:
  nltk.download(dict_id, quiet=True)

## Setup metrics

## Setup LLMs

### Text preprocessing

Define a function for text preprocessing, which is an important step before calculating any metrics. This preprocessing function will help in cleaning the text data, making it ready for further analysis. The preprocessing involves several steps:
1. Lowercasing
2. Stopwords removal
3. Lemmatization
4. Remove accents from characters

In [5]:
lemmatizer = nltk.stem.WordNetLemmatizer()

def preprocess(corpus: str) -> str:
  corpus = corpus.lower()
  stopset = nltk.corpus.stopwords.words('english') + nltk.corpus.stopwords.words('russian') + list(string.punctuation)
  tokens = nltk.word_tokenize(corpus)
  tokens = [t for t in tokens if t not in stopset]
  tokens = [lemmatizer.lemmatize(t) for t in tokens]
  corpus = ' '.join(tokens)
  corpus = unidecode(corpus)
  return corpus

### Embedding Initialization

Here we are initializing the Llama 3 embeddings model. The `OllamaEmbeddings` class is a component of the Ollama library, a set of pre-trained language models. This model is capable of embedding corpora of any length into a 4096-dimensional vector.

The use of `OllamaEmbeddings` requires the installation of a local Ollama server, which can be found at https://ollama.com.

In [6]:
embeddings = OllamaEmbeddings(model='llama3.1')

### Average embeddings cosine similarity metric

This function calculates the average cosine similarity between expected answers and LLM predicted answers using their respective embeddings. Cosine similarity is a measure of similarity between two non-zero vectors of an inner product space that measures the cosine of the angle between them:

$$
K(a, b) = \frac{\sum \limits_{i=1}^n a_i b_i}{\sqrt{\sum \limits_{i=1}^n a_i^2} \cdot \sqrt{\sum \limits_{i=1}^n b_i^2}}
$$

In [7]:
def embeddings_cosine_sim_metric(expected_answers: list[str], predicted_answers: list[str]) -> float:
  results = []

  for expected_answer, predicted_answer in zip(expected_answers, predicted_answers):
    expected_answer = preprocess(expected_answer)
    predicted_answer = preprocess(predicted_answer)

    expected_embedding = np.array(embeddings.embed_query(expected_answer))
    predicted_embedding = np.array(embeddings.embed_query(predicted_answer))

    sim = cosine_similarity(
      expected_embedding.reshape(1, -1),
      predicted_embedding.reshape(1, -1),
    )[0][0]

    results.append(sim)

  return np.mean(results)

### BLEU Metric

This function calculates the average BLEU (Bilingual Evaluation Understudy) score between expected answers and predicted answers. The BLEU score is a measure that compares a candidate translation of text to one or more reference translations.

A smoothing function is defined to calculate the BLEU score. Smoothing is useful when a perfect match is not found. It ensures that the BLEU scores aren't zero.

In [8]:
smoothie_f = nltk.translate.bleu_score.SmoothingFunction().method4

def bleu_metric(expected_answers, predicted_answers):
  scores = []

  for expected_answer, predicted_answer in zip(expected_answers, predicted_answers):
    expected_answer = preprocess(expected_answer)
    predicted_answer = preprocess(predicted_answer)

    predicted_tokens = nltk.word_tokenize(predicted_answer)
    expected_tokens = [nltk.word_tokenize(expected_answer)]

    score = nltk.translate.bleu_score.sentence_bleu(
      expected_tokens,
      predicted_tokens,
      smoothing_function=smoothie_f,
    )

    scores.append(score)

  return np.mean(scores)

### ROGUE-1 (Recall-Oriented Understudy for Gisting Evaluation 1-gram Scoring)

In [9]:
rogue_1_scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)

def rogue_1_metric(expected_answers, predicted_answers):
  scores = []

  for expected_answer, predicted_answer in zip(expected_answers, predicted_answers):
    expected_answer = preprocess(expected_answer)
    predicted_answer = preprocess(predicted_answer)

    result = rogue_1_scorer.score(expected_answer, predicted_answer)

    scores.append(result['rouge1'])

  return np.mean(scores)

### ROGUE-L (Recall-Oriented Understudy for Gisting Evaluation Longest Common Subsequence)

In [10]:
rogue_l_scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

def rogue_l_metric(expected_answers, predicted_answers):
  scores = []

  for expected_answer, predicted_answer in zip(expected_answers, predicted_answers):
    expected_answer = preprocess(expected_answer)
    predicted_answer = preprocess(predicted_answer)

    result = rogue_l_scorer.score(expected_answer, predicted_answer)

    scores.append(result['rougeL'])

  return np.mean(scores)

## Setup LLMs

### Llama 3.1

In [11]:
def get_llama3_1_llm(temperature=0):
  return Ollama(model='llama3.1', temperature=temperature)

### Mistral Large

In [12]:
def get_mistral_large_llm(temperature=0):
  return ChatMistralAI(
    model='mistral-large-latest',
    temperature=temperature,
  )

### GPT-4o mini

In [13]:
def get_chatgpt_40_mini_llm(temperature=0):
  return ChatOpenAI(
    model='gpt-4o-mini',
    temperature=temperature,
  )

## Setup experiments

### Load QA dataset

In [14]:
qa_df = pd.read_csv('brainscape.csv')[:214]
qa_df

Unnamed: 0,question,answer
0,What are the afferent cranial nerve nuclei?,Trigeminal sensory nucleus- fibres carry gener...
1,What is the order of the cranial nerves ?,1-olfactory\n2-optic\n3-oculomotor\n4-trochlea...
2,What are the efferent cranial nerve nuclei?,Edinger-westphal nucleus\nOculomotor nucleus\n...
3,Which nuclei share the embryo logical origin -...,Oculomotor nucleus Trochlear nucleus Abducens ...
4,Which nuclei share the embryo logical origin- ...,Trigeminal motor nucleus Facial motor nucleus ...
...,...,...
209,What are spontaneous active bursting neurones ?,These are pacemaker neurons They don’t require...
210,What is the patch-clamp technique ?,Enables recordings of electrical activity of a...
211,What toxin can be used to block sodium currents ?,Tetrodotoxin-it blocks sodium channels
212,What toxin can block potassium currents ?,Tetrammonium - blocks potassium channels


### Load cached RAGs responses

In [15]:
cache_path = Path('cache.json')

if not os.path.exists(cache_path):
  data = {}
  with open(cache_path, 'w') as file:
    json.dump(data, file)

with open(cache_path, 'r') as f:
  cache = json.load(f)

len(cache.keys())

0

### Setup experiment grid search parameters

In [16]:
llms = (
  ('GPT-4o mini', get_chatgpt_40_mini_llm()),
  ('Mistral Large', get_mistral_large_llm()),
  ('LLaMA 3.1', get_llama3_1_llm()),
)

### Conduct the experiment

In [17]:
blender = llm_blender.Blender()
blender.loadranker("llm-blender/PairRM")
blender.loadfuser("llm-blender/gen_fuser_3b")



AssertionError: Torch not compiled with CUDA enabled

In [90]:
df = pd.DataFrame()

questions = qa_df['question'].tolist()
expected_answers = qa_df['answer'].tolist()

predicted_answers = []

for question in tqdm(questions, desc='Questions'):
  if not question in cache:
    llm_answers = []

    for llm_name, llm in llms:
      chain = llm | StrOutputParser()
      llm_answer = chain.invoke(question)
      llm_answers.append(llm_answer)

    fuse_generations, ranks = blender.rank_and_fuse([question], [llm_answers], instructions=[''], return_scores=False, batch_size=2, top_k=3)
    cache[question] = fuse_generations[0]

  predicted_answers.append(cache[question])

  with open(cache_path, 'w') as f:
    json.dump(cache, f)

# Evaluate metrics
cos_sim = embeddings_cosine_sim_metric(expected_answers, predicted_answers)
bleu_score = bleu_metric(expected_answers, predicted_answers)
rogue_1_score = rogue_1_metric(expected_answers, predicted_answers)
rogue_l_score = rogue_l_metric(expected_answers, predicted_answers)

cos_sim, bleu_score, rogue_1_score, rogue_l_score

Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 14.90it/s]
Fusing candidates: 100%|██████████| 1/1 [00:01<00:00,  1.36s/it]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 20.89it/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (534 > 512). Running this sequence through the model will result in indexing errors
Fusing candidates: 100%|██████████| 1/1 [00:00<00:00,  1.27it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 20.79it/s]
Fusing candidates: 100%|██████████| 1/1 [00:01<00:00,  1.38s/it]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 29.46it/s]
Fusing candidates: 100%|██████████| 1/1 [00:00<00:00,  1.14it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 23.24it/s]
Fusing candidates: 100%|██████████| 1/1 [00:00<00:00,  1.33it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 28.04it/s]
Fusing candidates: 100%|██████████| 1/1 [00:00<00:00,  1.32it/s]
Ranking candidates: 100%|███████

(0.6120309928894316,
 0.023211538364972827,
 0.20501471533742882,
 0.17992878112524957)