In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredFileLoader, DirectoryLoader
import os
from datasets import load_dataset
from deep_translator import GoogleTranslator
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate

In [None]:
# SECURITY: API keys must never be committed to source control. Any key that
# was ever hard-coded in this cell should be treated as leaked and revoked.
# The key is taken from the environment; only when it is missing do we prompt
# for it interactively (getpass does not echo the input).
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
## Read every PDF found under data/ (recursively) and cut the text into chunks
loader = DirectoryLoader(
    "data/",
    glob="**/*.pdf",
    show_progress=True,
    loader_cls=UnstructuredFileLoader,
)
documents = loader.load()

# Chunks of up to 1000 characters, with 100 characters shared between neighbours
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
docs = text_splitter.split_documents(documents)

# Embedding model used later when the vector store is built
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [None]:
## Translate each chunk of text from Norwegian ('no') to English ('en'), in place
# Source and target language are the same for every chunk, so the translator
# is built once instead of once per iteration.
translator = GoogleTranslator(source='no', target='en')
for document in docs:
    # A missing (None) translation result is stored as an empty string so
    # that page_content is always a str for the steps that follow.
    document.page_content = translator.translate(text=document.page_content) or ""

In [None]:
## Replace missing (None) chunk content with an empty string
for doc in docs:
    if doc.page_content is None:
        doc.page_content = ""

In [None]:
## Build the Chroma vector store from the chunks, embedded with `embeddings`
db = Chroma.from_documents(documents=docs, embedding=embeddings)

In [None]:
# Load at most the first 50 rows of the question/answer CSV
references = load_dataset(
    'csv',
    data_files=r'/Users/adrianfolge/Documents/lokal:skole/Master/data/synthetic_data/question_with_answers.csv',
    split="train[:50]",
)

In [None]:
# Answers produced for the reference questions, in dataset order
list_of_answers = []

# LLM that answers each question
llm = ChatOpenAI(model="gpt-3.5-turbo-1106")

# Prompt that carries the question and the retrieved context
prompt_template = PromptTemplate(
    template="""Task: Provide an answer
You are going to provide an answer to this question: {question}, based off this context: {context}. Give the answer in Norwegian.
agent_scratchpad: This is the scratchpad where you can store intermediate information.""",
    input_variables=["question", "context"]
)
chain = prompt_template | llm

# The language pairs never change, so both translators are built once
# instead of twice per question.
to_english = GoogleTranslator(source='no', target='en')
to_norwegian = GoogleTranslator(source='en', target='no')

# Iterate over the rows that were actually loaded instead of a hard-coded
# range(50): a dataset with fewer rows no longer raises IndexError, and the
# "Question" column is read once rather than on every iteration.
for query in references["Question"]:
    translated_query = to_english.translate(text=query)
    found_docs = db.similarity_search(translated_query)
    # Same context string as before: the literal "Context" followed by the
    # retrieved chunks back to back (no separator), so prompts are unchanged.
    context = "Context" + "".join(doc.page_content for doc in found_docs)
    answer = chain.invoke(
        {
            "question": translated_query,
            "context": context,
        }
    )
    # NOTE(review): the prompt asks for a Norwegian answer, yet the reply is
    # translated en -> no here; confirm which of the two is intended.
    translated_answer = to_norwegian.translate(text=answer.content)
    list_of_answers.append(translated_answer)

In [None]:
# Reference answers: the "Answer" column of the loaded dataset
refs = references["Answer"]

In [None]:
# Raw text of the judge's reply for each (reference, prediction) pair
content_list = []

# LLM that judges the predicted answers
llm = ChatOpenAI(model="gpt-3.5-turbo-1106")

# Prompt asking whether the prediction matches the reference
prompt_template = PromptTemplate(
    template="""Task: Answer Evaluation
You are given a reference answer and a predicted answer. Your task is to determine whether the predicted answer matches the reference answer correctly. It does not have to be an exact match, but it should be somewhat the same.
- The reference answer is the correct answer.
- The predicted answer is the answer generated by a model or provided by a user.
Your response should indicate whether the predicted answer is correct or not.
Reference answer: {reference}
Predicted answer: {prediction}
Is the predicted answer correct? [Yes/No]
agent_scratchpad: This is the scratchpad where you can store intermediate information.""",
    input_variables=["prediction", "reference"]
)
chain = prompt_template | llm

# Pair every reference with its prediction instead of indexing a hard-coded
# range(50), which raised IndexError when fewer than 50 answers existed.
# zip stops at the shorter of the two sequences.
for reference, prediction in zip(refs, list_of_answers):
    score = chain.invoke(
        {
            "reference": reference,
            "prediction": prediction,
        }
    )
    content_list.append(score.content)

In [None]:
import re

# Classify each reply by its first word. An exact `== 'Yes'` comparison
# silently drops replies such as "Yes.", "[Yes]", "yes" or "No, because ...".
# Leading punctuation/whitespace is skipped, and the word boundary keeps
# words like "Not" or "None" from being counted as "No".
verdict_pattern = re.compile(r"\W*(yes|no)\b", re.IGNORECASE)

verdicts = []
for content in content_list:
    found = verdict_pattern.match(content or "")
    verdicts.append(found.group(1).lower() if found else None)

count_yes = verdicts.count("yes")
count_no = verdicts.count("no")
count_other = len(verdicts) - count_yes - count_no

# Displaying the counts
print("Number of 'Yes':", count_yes)
print("Number of 'No':", count_no)
if count_other:
    # Replies that start with neither word are reported rather than hidden
    print("Number of unclassified responses:", count_other)