# GraphRAG

## Import packages

In [29]:
import os
import re
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import json
from dotenv import load_dotenv
from getpass import getpass

from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field

from neurorag.neurorag import NeuroRAG

## Disable warnings

In [30]:
import warnings
# Suppress all warnings: the 'ignore' action is installed with no category,
# message or module restriction, so it applies to every warning raised later.
warnings.filterwarnings('ignore')

## Setup environment variables

Define the following environment variables in a `.env` file or in the terminal environment; any variable that is still missing will be requested through an input prompt in this Jupyter notebook:
1. MISTRAL_API_KEY
2. OPENAI_API_KEY
3. OPENAI_PROXY
4. TAVILY_API_KEY
5. ENTREZ_EMAIL

## Load environment variables

In [31]:
# Environment variables this notebook requires.
env_variables = [
  'MISTRAL_API_KEY', 'OPENAI_API_KEY', 'OPENAI_PROXY', 'TAVILY_API_KEY', 'ENTREZ_EMAIL',
]

# Read a `.env` file (if there is one) before checking which variables are missing.
load_dotenv()

for key in env_variables:
  value = os.environ.get(key)
  # Anything still unset is asked for interactively, using the variable name as the prompt.
  value = getpass(key) if value is None else value
  os.environ[key] = value

## Setup MMLU tests

In [None]:
def extract_json(response):
  """Return the first JSON object embedded in ``response`` as a string.

  The text is scanned for the first ``{`` from which a complete JSON object
  can be decoded. Nested objects and closing braces inside string values are
  therefore kept whole instead of being cut at the first ``}``.

  If no position decodes as valid JSON (for example because the reply contains
  invalid escape sequences), the function falls back to the shortest
  ``{...}`` span, and finally to the unchanged input when there is no such
  span at all.

  Args:
    response: Raw text, expected to contain a JSON object somewhere.

  Returns:
    The extracted JSON text with every doubled backslash collapsed to a
    single one, or ``response`` itself when no brace-delimited span exists.
  """
  decoder = json.JSONDecoder()
  start = response.find('{')

  while start != -1:
    try:
      # raw_decode tolerates trailing text and reports where the object ends.
      _, end = decoder.raw_decode(response, start)
    except json.JSONDecodeError:
      start = response.find('{', start + 1)
      continue

    return response[start:end].replace('\\\\', '\\')

  # Nothing parsed cleanly: keep the previous best-effort extraction so that
  # slightly malformed replies are still handed on to the caller.
  match = re.search(r'\{.*?\}', response, re.DOTALL)

  if match:
    return match.group().strip().replace('\\\\', '\\')

  return response

In [None]:
# Structure the model's reply is parsed into: one string field holding the
# letter of the chosen option.
# NOTE(review): documented with `#` comments on purpose - a class docstring is
# presumably copied into the generated JSON schema and would then change the
# format instructions sent to the model; confirm before converting.
class RAGSchema(BaseModel):
  correct_answer: str = Field(
    description='Given a question and answer options, provide the corresponding letter for the correct answer.',
  )

# Parser that turns a JSON reply into a RAGSchema instance and also supplies
# the format instructions embedded in the prompt below.
rag_parser = PydanticOutputParser(pydantic_object=RAGSchema)

# Multiple-choice prompt: format instructions first, then the question with
# its four options, then the context.
rag_template = """Answer the following multiple choice question by giving the most appropriate response in json format. Answer should be one among [A, B, C, D].

{format_instructions}

Question: {question}\n
A) {a}\n
B) {b}\n
C) {c}\n
D) {d}\n

Context:

{context}
"""

# `format_instructions` is bound once here; the remaining variables are
# supplied later by the code that uses the prompt.
prompt = PromptTemplate(
  input_variables=['question', 'a', 'b', 'c', 'd', 'context'],
  partial_variables=dict(format_instructions=rag_parser.get_format_instructions()),
  template=rag_template,
)

In [38]:
from datasets import load_dataset

# Maps a lower-case answer letter to its zero-based option index.
letter_to_number = dict(zip('abcd', range(4)))

def eval_rag(app, mmlu_subset: str, *, max_retries: 'int | None' = None) -> float:
  """Measure the accuracy of ``app`` on the test split of one MMLU subset.

  Every question is sent to ``app.invoke`` together with a prompt that already
  contains the four answer options. The reply's ``'generation'`` text is
  parsed into a ``RAGSchema`` and the first letter of ``correct_answer`` is
  compared with the dataset's answer index.

  Replies that cannot be parsed, are empty, or name a letter outside A-D are
  counted as wrong instead of aborting the run.

  Args:
    app: Object exposing ``invoke(question, prompt)`` that returns a mapping
      with a ``'generation'`` entry.
    mmlu_subset: Name of the ``cais/mmlu`` configuration to load.
    max_retries: Maximum number of retries after a failed ``app.invoke`` call.
      ``None`` (the default) retries until the call succeeds.

  Returns:
    Fraction of test questions answered correctly, or ``0.0`` when the test
    split is empty.

  Raises:
    Exception: The last error raised by ``app.invoke`` once ``max_retries``
      retries have been used up.
  """
  dataset = load_dataset('cais/mmlu', mmlu_subset)
  test_df = dataset['test'].to_pandas()
  total_questions = len(test_df)

  # Avoid a ZeroDivisionError on an empty split.
  if total_questions == 0:
    return 0.0

  correct_answers_count = 0

  # Passing `total` lets tqdm show progress without copying all rows into a list.
  for index, row in tqdm(test_df.iterrows(), total=total_questions, desc='Questions'):
    question = row['question']
    choices = row['choices']
    correct_answer = row['answer']

    # Built once per question; it does not depend on the retry attempt.
    prompt_with_choices = prompt.partial(
      a=choices[0],
      b=choices[1],
      c=choices[2],
      d=choices[3],
    )

    failed_attempts = 0

    while True:
      try:
        llm_answer = app.invoke(question, prompt_with_choices)
        break
      except Exception as e:
        print(index, e)
        failed_attempts += 1

        # Give up only when the caller asked for a bounded number of retries.
        if max_retries is not None and failed_attempts > max_retries:
          raise

    try:
      json_string = extract_json(llm_answer['generation'])
      response_object = rag_parser.invoke(json_string)
    except (ValueError, KeyError, TypeError) as e:
      # A malformed reply is scored as a wrong answer, not a fatal error.
      print(index, e)
      continue

    # Slicing (not indexing) yields '' for an empty answer instead of raising.
    llm_answer_letter = response_object.correct_answer.strip().lower()[:1]

    if llm_answer_letter not in letter_to_number:
      continue

    if letter_to_number[llm_answer_letter] == correct_answer:
      correct_answers_count += 1

  return correct_answers_count / total_questions

## Build model

In [87]:
# Create the NeuroRAG application with its default arguments and run its
# compile() step before it is used for evaluation.
# NOTE(review): what compile() builds is defined in neurorag.neurorag and is
# not visible here - confirm whether it must be called exactly once.
app = NeuroRAG()
app.compile()

## Evaluate RAG

In [None]:
# MMLU configurations to evaluate, in the order their results are printed.
subsets: list[str] = ['medical_genetics', 'college_biology', 'college_medicine']

for subset in subsets:
  # Each subset is evaluated in full before its accuracy line is printed.
  accuracy = eval_rag(app, subset)
  print(f'{subset}: {accuracy}')