## Import packages

In [35]:
! pip install nltk numpy pandas unidecode python-dotenv tqdm rouge-score chromadb pdfminer.six docarray pymupdf
! pip install transformers torch scikit-learn
! pip install langchain langchain-core langchain-community langchain_experimental langchain-chroma langchain_mistralai



In [36]:
import os
import json
import nltk
import string
import numpy as np
import pandas as pd
from unidecode import unidecode
from sklearn.metrics.pairwise import cosine_similarity
from pathlib import Path
from tqdm import tqdm
from dotenv import load_dotenv
from getpass import getpass
from rouge_score import rouge_scorer

from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain_mistralai.chat_models import ChatMistralAI
from langchain_openai import ChatOpenAI

## Setup environment variables

You have to define the following environment variables in the `.env` file, terminal environment, or input field within this Jupyter notebook:
1. MISTRAL_API_KEY

In [37]:
env_variables = [
  'MISTRAL_API_KEY',
  'OPENAI_API_KEY',
]

load_dotenv()

for key in env_variables:
  value = os.getenv(key)

  if value is None:
    value = getpass(key)

  os.environ[key] = value

## Download NLTK dictionaries

These dictionaries are needed for further text preprocessing.

In [38]:
dict_ids = [
  'punkt_tab',
  'punkt',
  'stopwords',
  'wordnet',
]

for dict_id in dict_ids:
  nltk.download(dict_id, quiet=True)

## Setup metrics

### Text preprocessing

Define a function for text preprocessing, which is an important step before calculating any metrics. This preprocessing function will help in cleaning the text data, making it ready for further analysis. The preprocessing involves several steps:
1. Lowercasing
2. Stopwords removal
3. Lemmatization
4. Remove accents from characters

In [39]:
lemmatizer = nltk.stem.WordNetLemmatizer()

def preprocess(corpus: str) -> str:
  corpus = corpus.lower()
  stopset = nltk.corpus.stopwords.words('english') + nltk.corpus.stopwords.words('russian') + list(string.punctuation)
  tokens = nltk.word_tokenize(corpus)
  tokens = [t for t in tokens if t not in stopset]
  tokens = [lemmatizer.lemmatize(t) for t in tokens]
  corpus = ' '.join(tokens)
  corpus = unidecode(corpus)
  return corpus

### Embedding Initialization

Here we are initializing the Llama 3 embeddings model. The `OllamaEmbeddings` class is a component of the Ollama library, a set of pre-trained language models. This model is capable of embedding corpora of any length into a 4096-dimensional vector.

The use of `OllamaEmbeddings` requires the installation of a local Ollama server, which can be found at https://ollama.com.

In [40]:
embeddings = OllamaEmbeddings(model='llama3.1')

### Average embeddings cosine similarity metric

This function calculates the average cosine similarity between expected answers and LLM predicted answers using their respective embeddings. Cosine similarity is a measure of similarity between two non-zero vectors of an inner product space that measures the cosine of the angle between them:

$$
K(a, b) = \frac{\sum \limits_{i=1}^n a_i b_i}{\sqrt{\sum \limits_{i=1}^n a_i^2} \cdot \sqrt{\sum \limits_{i=1}^n b_i^2}}
$$

In [41]:
def embeddings_cosine_sim_metric(expected_answers: list[str], predicted_answers: list[str]) -> float:
  results = []

  for expected_answer, predicted_answer in zip(expected_answers, predicted_answers):
    expected_answer = preprocess(expected_answer)
    predicted_answer = preprocess(predicted_answer)

    expected_embedding = np.array(embeddings.embed_query(expected_answer))
    predicted_embedding = np.array(embeddings.embed_query(predicted_answer))

    sim = cosine_similarity(
      expected_embedding.reshape(1, -1),
      predicted_embedding.reshape(1, -1),
    )[0][0]

    results.append(sim)

  return np.mean(results)

### BLEU Metric

This function calculates the average BLEU (Bilingual Evaluation Understudy) score between expected answers and predicted answers. The BLEU score is a measure that compares a candidate translation of text to one or more reference translations.

A smoothing function is defined to calculate the BLEU score. Smoothing is useful when a perfect match is not found. It ensures that the BLEU scores aren't zero.

In [42]:
smoothie_f = nltk.translate.bleu_score.SmoothingFunction().method4

def bleu_metric(expected_answers, predicted_answers):
  scores = []

  for expected_answer, predicted_answer in zip(expected_answers, predicted_answers):
    expected_answer = preprocess(expected_answer)
    predicted_answer = preprocess(predicted_answer)

    predicted_tokens = nltk.word_tokenize(predicted_answer)
    expected_tokens = [nltk.word_tokenize(expected_answer)]

    score = nltk.translate.bleu_score.sentence_bleu(
      expected_tokens,
      predicted_tokens,
      smoothing_function=smoothie_f,
    )

    scores.append(score)

  return np.mean(scores)

### ROGUE-1 (Recall-Oriented Understudy for Gisting Evaluation 1-gram Scoring)

In [43]:
rogue_1_scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)

def rogue_1_metric(expected_answers, predicted_answers):
  scores = []

  for expected_answer, predicted_answer in zip(expected_answers, predicted_answers):
    expected_answer = preprocess(expected_answer)
    predicted_answer = preprocess(predicted_answer)

    result = rogue_1_scorer.score(expected_answer, predicted_answer)

    scores.append(result['rouge1'])

  return np.mean(scores)

### ROGUE-L (Recall-Oriented Understudy for Gisting Evaluation Longest Common Subsequence)

In [44]:
rogue_l_scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

def rogue_l_metric(expected_answers, predicted_answers):
  scores = []

  for expected_answer, predicted_answer in zip(expected_answers, predicted_answers):
    expected_answer = preprocess(expected_answer)
    predicted_answer = preprocess(predicted_answer)

    result = rogue_l_scorer.score(expected_answer, predicted_answer)

    scores.append(result['rougeL'])

  return np.mean(scores)

## Setup LLMs

### Llama 3.1

In [45]:
def get_llama3_1_llm(temperature=0):
  return Ollama(model='llama3.1', temperature=temperature)

### Mistral Large

In [46]:
def get_mistral_large_llm(temperature=0):
  return ChatMistralAI(
    model='mistral-large-latest',
    temperature=temperature,
  )

## GPT-4o mini

In [47]:
def get_chatgpt_40_mini_llm(temperature=0):
  return ChatOpenAI(
    model='gpt-4o-mini',
    temperature=temperature,
  )

## Setup experiments

### Load QA dataset

In [48]:
qa_df = pd.read_csv('brainscape.csv')
qa_df

Unnamed: 0,question,answer
0,What are the afferent cranial nerve nuclei?,Trigeminal sensory nucleus- fibres carry gener...
1,What is the order of the cranial nerves ?,1-olfactory\n2-optic\n3-oculomotor\n4-trochlea...
2,What are the efferent cranial nerve nuclei?,Edinger-westphal nucleus\nOculomotor nucleus\n...
3,Which nuclei share the embryo logical origin -...,Oculomotor nucleus Trochlear nucleus Abducens ...
4,Which nuclei share the embryo logical origin- ...,Trigeminal motor nucleus Facial motor nucleus ...
...,...,...
1047,What is the purpose of gephyrin in the glycine...,Involved in anchoring the receptor to a specif...
1048,What is the glycine receptor involved in ?,Reflex response\nCauses reciprocal inhibition ...
1049,What happens in hyperperplexia ?,It’s an exaggerated reflex Often caused by a m...
1050,What is hyperperplexia treated with ?,Benzodiazepine


### Setup experiment grid search parameters

In [49]:
llms = (
  ('GPT-4o mini', get_chatgpt_40_mini_llm()),
  ('Mistral Large', get_mistral_large_llm()),
  ('LLaMA 3.1', get_llama3_1_llm()),
)

### Load cached responses

In [50]:
cache_path = Path('cache.json')

if not os.path.exists(cache_path):
  data = {}
  with open(cache_path, 'w') as file:
    json.dump(data, file)

with open(cache_path, 'r') as f:
  cache = json.load(f)

cache.keys()

dict_keys(['LLaMA 3.1', 'Mistral Large', 'GPT-4o mini'])

### Conduct the grid search

In [51]:
from langchain_core.output_parsers import StrOutputParser

df = pd.DataFrame()

questions = qa_df['question'].tolist()
expected_answers = qa_df['answer'].tolist()

for llm_name, llm in tqdm(llms, desc='LLMs'):
  chain = llm | StrOutputParser()

  predicted_answers = []

  for question in tqdm(questions, desc='Questions', leave=False):
    if not llm_name in cache:
      cache[llm_name] = {}
    if not question in cache[llm_name]:
      cache[llm_name][question] = chain.invoke(question)

    predicted_answers.append(cache[llm_name][question])

    with open(cache_path, 'w') as f:
      json.dump(cache, f)

  # Evaluate metrics
  cos_sim = embeddings_cosine_sim_metric(expected_answers, predicted_answers)
  bleu_score = bleu_metric(expected_answers, predicted_answers)
  rogue_1_score = rogue_1_metric(expected_answers, predicted_answers)
  rogue_l_score = rogue_l_metric(expected_answers, predicted_answers)

  # Save results
  row = pd.DataFrame({
    'llm': llm_name,
    'cos_sim': cos_sim,
    'bleu': bleu_score,
    'rogue_1': rogue_1_score,
    'rogue_l': rogue_l_score,
  }, index=[0])
  df = pd.concat([df, row], ignore_index=True)

df.sort_values(by='cos_sim', ascending=False)

LLMs: 100%|██████████| 3/3 [1:30:46<00:00, 1815.52s/it]  


Unnamed: 0,llm,cos_sim,bleu,rogue_1,rogue_l
0,GPT-4o mini,0.469676,0.007554,0.215925,0.182443
1,Mistral Large,0.468328,0.006592,0.213689,0.180652
2,LLaMA 3.1,0.46715,0.006327,0.18517,0.159363
