In [None]:
%pip install langchain_core langchain_community langchain langchain_pinecone jq pypdf bs4 pandas numpy pinecone-client datasets ragas

import os
import warnings

# Pinecone credentials are read from the process environment rather than being
# embedded in the notebook. A literal API key used to live on this line; any
# key that has been committed to source control should be rotated.
# Export PINECONE_API_KEY before launching the notebook/kernel.
if not os.environ.get("PINECONE_API_KEY"):
    warnings.warn(
        "PINECONE_API_KEY is not set; export it before running the Pinecone cells.",
        stacklevel=2,
    )

# The region is not a secret, so a default is fine; an existing value wins so
# the notebook can be pointed at another environment without editing it.
os.environ.setdefault("PINECONE_API_ENV", "us-east-1")

# --- Chain building blocks: output parser, embedding model, prompt ----------
from langchain.prompts import ChatPromptTemplate
from langchain_community.embeddings import OllamaEmbeddings
from langchain_core.output_parsers import StrOutputParser

# Final stage of every chain built further down (setup | prompt | model | parser).
parser = StrOutputParser()

# Embedding model handed to the Pinecone vector store for indexing and querying.
embeddings = OllamaEmbeddings(model="nomic-embed-text")

# System instructions; {context} and {question} are filled in by the chain.
template = """
You are an AI assistant, trained to provide understandable and accurate information about pharmacogenomics and drugs.
You will base your responses on the context and information provided. Output both your answer and a score of how confident you are,
 and also cite the references. Also provide the source of the chunks of the documents used for response.
If the information related to the question is not in the context and or in the information provided in the prompt, 
you will say 'I don't know'.
You are not a healthcare provider and you will not provide medical care or make assumptions about treatment.


Context: {context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

from langchain_community.document_loaders import JSONLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders.csv_loader import CSVLoader

# Every file directly inside this folder is routed to a loader by extension;
# files with any other extension are ignored.
folder_path = "/home/dhanushb/Wellytics/RAG_data/all_files"
pdfdocs, csvdata, jsondata = [], [], []

for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    if filename.endswith(".pdf"):
        pdfdocs.extend(PyPDFLoader(file_path).load())
    elif filename.endswith(".csv"):
        csvdata.extend(CSVLoader(file_path).load())
    elif filename.endswith(".json"):
        # jq schema "." selects the whole JSON document; text_content=False
        # lets non-string content through.
        json_loader = JSONLoader(
            file_path, jq_schema=".", json_lines=False, text_content=False
        )
        jsondata.extend(json_loader.load())

# Replace tab characters in the extracted PDF text with single spaces.
for page in pdfdocs:
    page.page_content = page.page_content.replace("\t", " ")

from langchain.text_splitter import RecursiveCharacterTextSplitter

# One splitter shared by PDF and JSON content: 5000-character chunks with a
# 1000-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=1000)

documents = text_splitter.split_documents(pdfdocs)
jsondocs = text_splitter.split_documents(jsondata)

# Final corpus order: PDF chunks, then JSON chunks, then the CSV documents,
# which are appended as loaded (they are not passed through the splitter).
documents.extend(jsondocs)
documents.extend(csvdata)


# Deprecated community wrapper; the import is retained only in case other cells
# still reference the name. Indexing below goes through langchain_pinecone.
from langchain_community.vectorstores import Pinecone
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_pinecone import PineconeVectorStore

# Embed `documents` and upsert them into the existing "rag-data" index, keeping
# the returned store for retrieval. This replaces the previous two-step flow
# (index through the deprecated community class, then reconnect with
# PineconeVectorStore).
# NOTE(review): this upserts the whole corpus every time the cell runs —
# presumably that duplicates vectors on re-runs; confirm, and switch to
# PineconeVectorStore(embedding=embeddings, index_name="rag-data") once the
# index is populated.
vectorstore = PineconeVectorStore.from_documents(
    documents, embedding=embeddings, index_name="rag-data"
)

retriever = vectorstore.as_retriever()

# Chain input stage: the raw question string is sent to the retriever (to
# produce `context`) and passed through unchanged as `question`.
setup = RunnableParallel(context=retriever, question=RunnablePassthrough())

import numpy as np
import pandas as pd

# Evaluation questions; the "Questions" column holds the prompts.
Questions = pd.read_csv("/home/dhanushb/Wellytics/RAG_data/Questions.csv")

questions = Questions["Questions"].to_list()

# For each question, keep the text of every document the retriever returns,
# stored as one list of chunk strings per row in the "Context" column.
cont = [
    [docs.page_content for docs in retriever.invoke(question)]
    for question in questions
]
Questions["Context"] = cont

from langchain_community.llms import Ollama

# Ollama model tags to compare; each gets its own "<model>_resp" column.
MODELS = ["mistral", "gemma", "llama2", "llama3"]

n = len(questions)

for MODEL in MODELS:
    model = Ollama(model=MODEL)
    chain = setup | prompt | model | parser
    resp = []
    for answered, question in enumerate(questions, start=1):
        resp.append(chain.invoke(question))
        # Checkpoint after every answer: rows not yet answered are padded with
        # NaN so the column always matches the DataFrame length.
        Questions[MODEL + "_resp"] = resp + [np.nan] * (n - answered)
        # index=False: this file is also the input read by pd.read_csv, so
        # writing the index would add a new "Unnamed: 0" column on every run.
        Questions.to_csv(
            "/home/dhanushb/Wellytics/RAG_data/Questions.csv", index=False
        )

from datasets import Dataset
from ragas import evaluate
# No trailing comma here: a trailing comma in an un-parenthesized
# `from ... import` list is a SyntaxError.
from ragas.metrics.critique import harmfulness, strictness
from ragas.metrics import (
    faithfulness,
    context_recall,
    context_precision,
    context_entity_recall,
    answer_relevancy,
    answer_similarity,
    answer_correctness,
)

# Only questions that have a reference answer can be scored against ground truth.
quests = Questions[Questions["Expected_response"].notna()]

# NOTE(review): no llm/embeddings are passed to evaluate(), so ragas falls back
# to its defaults — confirm the credentials those defaults need are configured.
metrics = [
    context_entity_recall,
    context_precision,
    context_recall,
    faithfulness,
    answer_relevancy,
    answer_similarity,
    answer_correctness,
    harmfulness,
]

for MODEL in MODELS:

    # Build the column-oriented dict used to construct the ragas dataset.
    data = {
        "question": list(map(str, quests["Questions"].to_list())),
        "answer": list(map(str, quests[MODEL + "_resp"].to_list())),
        # The retrieved chunks are stored under "Context" (capital C). Each
        # sample gets one string per chunk; a value that is not a list (e.g. a
        # column reloaded from CSV) is wrapped as a single context string.
        "contexts": [
            [str(chunk) for chunk in context]
            if isinstance(context, (list, tuple))
            else [str(context)]
            for context in quests["Context"].to_list()
        ],
        "ground_truth": list(map(str, quests["Expected_response"].to_list())),
    }

    # Convert dict to dataset
    dataset = Dataset.from_dict(data)

    result = evaluate(dataset=dataset, metrics=metrics)

    # One CSV of per-sample scores per model.
    df = result.to_pandas()
    df.to_csv("/home/dhanushb/Wellytics/RAG_data/" + MODEL + "_eval.csv")