In [None]:
from langchain.agents import AgentExecutor, create_react_agent
from datasets import load_dataset
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.document_loaders import UnstructuredFileLoader, DirectoryLoader
from langchain_community.vectorstores import FAISS
import re
from langchain.tools.retriever import create_retriever_tool
from langchain.agents import AgentExecutor
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
import os
from deep_translator import GoogleTranslator
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.document_loaders import UnstructuredFileLoader, DirectoryLoader
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate

In [None]:
# Upper bound on how many question/answer rows are loaded and evaluated.
instances = 400

In [None]:
# Location of the question/answer CSV. The path can be overridden with the
# QA_CSV_PATH environment variable so the notebook is not tied to one machine;
# the default is the path that was previously hard-coded here.
qa_csv_path = os.environ.get(
    "QA_CSV_PATH",
    r'/Users/adrianfolge/Documents/lokal:skole/Master/data/synthetic_data/vol2_questions_and_answers.csv',
)
# Only the first `instances` rows of the train split are requested.
references = load_dataset('csv', data_files=qa_csv_path, split=f"train[:{instances}]")

In [None]:
# SECURITY: an OpenAI API key used to be hard-coded in this cell. A key that has
# appeared in source must be treated as leaked and revoked. The key is now taken
# from the environment, with an interactive prompt as fallback, so the secret
# never lives in the notebook itself.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
# Build one FAISS index per municipality from the PDFs found under ../data/.
# Each PDF is split into chunks, every chunk is translated from Norwegian to
# English, and the translated chunks are embedded and indexed.
loader = DirectoryLoader('../data/', glob="**/*.pdf", show_progress=True, loader_cls=UnstructuredFileLoader)
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0)

# Loop-invariant helpers: created once instead of once per PDF / per chunk.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
translator = GoogleTranslator(source='no', target='en')

databases = {}  # municipality name (taken from the PDF file name) -> FAISS index
for pdf_document in documents:
    source = pdf_document.metadata['source']
    match = re.search(r'\/([A-Za-z_]+)\.pdf', source)
    if match is None:
        # Without this guard a non-matching file would reuse the name left over
        # from the previous iteration and overwrite that municipality's index
        # (or raise NameError if it was the very first file).
        print(f"Skipping {source!r}: cannot derive a municipality name from the file name")
        continue
    municipality_name = match.group(1)

    chunks = text_splitter.split_documents([pdf_document])
    if not chunks:
        # NOTE(review): an index cannot be built from zero chunks; skip instead
        # of letting FAISS.from_documents fail on an empty list.
        print(f"Skipping {source!r}: no text chunks were produced")
        continue

    for chunk in chunks:
        translated_content = translator.translate(text=chunk.page_content)
        # A missing translation (None) is stored as an empty string, as before.
        chunk.page_content = "" if translated_content is None else translated_content

    databases[municipality_name] = FAISS.from_documents(chunks, embeddings)

In [None]:
# Chat model that drives the ReAct agent.
llm = ChatOpenAI(model="gpt-3.5-turbo-1106")

# ReAct prompt text. The placeholders {tools}, {tool_names}, {input} and
# {agent_scratchpad} are the template's declared input variables.
# NOTE(review): the template asks for a Norwegian final answer, while a later
# cell translates the agent output with source='en' -- confirm which is intended.
react_template = """Answer the following questions as best you can. You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question. Answer in Norwegian

Few-shot examples:
Question: What is the date of the decision on the Municipal Plan for the city center by the city council?
Final Answer: The date of the decision on the Municipal Plan for the city center by the city council is August 26, 2021.

Question: Where can one find common regulations in the document about Kristiansund?
Final Answer: Common regulations can be found on page 3 of the document about Kristiansund.

Question: What is the main theme of section 3.6 in the document?
Final Answer: The main theme of section 3.6 in the document is public areas.

Question: What is the subject of chapter 4 in the document from Kristiansund?
Final Answer: The subject of chapter 4 in the document from Kristiansund is buildings and facilities.
                        
Begin!

Question: {input}
Thought: {agent_scratchpad} 
"""

prompt = PromptTemplate(
    input_variables=["tool_names", "tools", "input", "agent_scratchpad"],
    template=react_template,
)




In [None]:
# Run the ReAct agent once per question and collect the answers, translated
# back to Norwegian, in dataset order.
list_of_answers = []

# Read each column once instead of re-reading it on every iteration, and iterate
# over the rows that were actually loaded (the dataset may hold fewer than
# `instances` rows).
questions = references["spørsmål"]
municipalities = references["kommunenavn"]

# Translators are stateless between calls, so build them once.
to_english = GoogleTranslator(source='no', target='en')
to_norwegian = GoogleTranslator(source='en', target='no')

for query, kommunenavn in zip(questions, municipalities):
    # The indexed chunks were translated to English, so the agent is queried in English too.
    translated_query = to_english.translate(text=query)
    # Raises KeyError if no index was built for this municipality.
    db = databases[kommunenavn]
    retriever = db.as_retriever()
    tool = create_retriever_tool(
        retriever,
        "search_planning_regulations",
        "Searches and returns excerpts planning regulation documents from different municipalities",
    )
    tools = [tool]
    agent = create_react_agent(llm, tools, prompt)
    agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True, handle_parsing_errors=True)
    # Pass the question as a plain string; the previous `{query}` wrapped it in a
    # Python set, so the prompt received the set's repr instead of the question.
    answer = agent_executor.invoke({"input": translated_query})
    print("########### HER ER ANSWER")
    print(answer)
    translated_answer = to_norwegian.translate(text=answer["output"])
    list_of_answers.append(translated_answer)

In [None]:
# Values of the "svar" column, cut to at most `instances` entries.
refs = references["svar"][:instances]

In [None]:
# LLM-as-judge evaluation: for every (reference, prediction) pair the chat model
# is asked whether the prediction matches the reference; its raw reply text is
# collected in `content_list`.
preds = list_of_answers
refs = references["svar"]
content_list = []  # one raw judge reply per evaluated pair

# Chat model used as the judge
llm = ChatOpenAI(model="gpt-3.5-turbo-1106")

# Judge prompt with two input variables: {reference} and {prediction}.
# NOTE(review): the final "agent_scratchpad: ..." line is plain text, not a
# template variable; it looks like a leftover and is kept unchanged here.
prompt_template = PromptTemplate(
    template="""Task: Answer Evaluation
You are given a reference answer and a predicted answer. Your task is to determine whether the predicted answer matches the reference answer correctly. It does not have to be an exact match, but it should be somewhat the same.
- The reference answer is the correct answer.
- The predicted answer is the answer generated by a model or provided by a user.
Your response should indicate whether the predicted answer is correct or not.
Reference answer: {reference}
Predicted answer: {prediction}
Is the predicted answer correct? [Yes/No]
agent_scratchpad: This is the scratchpad where you can store intermediate information.""",
    input_variables=["prediction", "reference"]
)
chain = prompt_template | llm

# Evaluate only pairs that exist on both sides; indexing up to `instances`
# unconditionally raised IndexError when fewer answers or references were available.
n_pairs = min(instances, len(refs), len(preds))
if len(preds) != len(refs):
    print(f"Warning: {len(preds)} predictions vs {len(refs)} references; evaluating {n_pairs} pairs")

for num in range(n_pairs):
    score = chain.invoke(
        {
            "reference": refs[num],
            "prediction": preds[num],
        }
    )
    content_list.append(score.content)

In [None]:
# Tally the judge's verdicts. A reply counts as yes/no when its first word
# (ignoring leading punctuation/whitespace and letter case) is "yes" or "no",
# so replies such as "Yes, it matches." or "no" are no longer dropped the way
# exact matching against 'Yes'/'Yes.'/'No'/'No.' dropped them.
count_yes = 0
count_no = 0
count_unparsed = 0
for reply in content_list:
    verdict = re.match(r"\W*(yes|no)\b", str(reply), flags=re.IGNORECASE)
    if verdict is None:
        count_unparsed += 1
    elif verdict.group(1).lower() == "yes":
        count_yes += 1
    else:
        count_no += 1

# Displaying the counts
print("Number of 'Yes':", count_yes)
print("Number of 'No':", count_no)
if count_unparsed:
    # Surface replies that could not be classified instead of losing them silently.
    print("Number of unparsed replies:", count_unparsed)