In [16]:
from datasets import load_dataset
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.document_loaders import UnstructuredFileLoader, DirectoryLoader
from langchain_community.vectorstores import FAISS
import pandas as pd
import re
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import os
from langchain.prompts import PromptTemplate
from deep_translator import GoogleTranslator

In [17]:
references = load_dataset('csv', data_files=r'/Users/adrianfolge/Documents/lokal:skole/Master/data/synthetic_data/vol2_questions_and_answers.csv', split="train[:400]")

In [18]:
os.environ["OPENAI_API_KEY"] = "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"

In [19]:
loader = DirectoryLoader('data/', glob="**/*.pdf", show_progress=True, loader_cls=UnstructuredFileLoader)
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0)
databases = {}
for doc in documents:
    source = doc.metadata['source']
    match = re.search(r'\/([A-Za-z_]+)\.pdf', source)
    if match:
        municipality_name = match.group(1)
    docs = text_splitter.split_documents([doc])
    for document in docs:
        page_content = document.page_content
        translated_content = GoogleTranslator(source='no', target='en').translate(text=page_content)
        document.page_content = translated_content
    for index, doc in enumerate(docs):
        if isinstance(doc.page_content, type(None)):
            docs[index].page_content = ""
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    db = FAISS.from_documents(docs, embeddings)
    databases[municipality_name] = db

100%|██████████| 10/10 [00:06<00:00,  1.54it/s]
Created a chunk of size 1147, which is longer than the specified 500
Created a chunk of size 1570, which is longer than the specified 500
Created a chunk of size 639, which is longer than the specified 500
Created a chunk of size 610, which is longer than the specified 500
Created a chunk of size 1008, which is longer than the specified 500
Created a chunk of size 545, which is longer than the specified 500
Created a chunk of size 536, which is longer than the specified 500
Created a chunk of size 583, which is longer than the specified 500
Created a chunk of size 1162, which is longer than the specified 500
Created a chunk of size 607, which is longer than the specified 500
Created a chunk of size 1206, which is longer than the specified 500
Created a chunk of size 697, which is longer than the specified 500
Created a chunk of size 734, which is longer than the specified 500
Created a chunk of size 916, which is longer than the specified

In [20]:
list_of_answers = []

# Choose the LLM that will drive the agent
llm = ChatOpenAI(model="gpt-3.5-turbo-1106")

# Define the prompt template
prompt_template = PromptTemplate(
    template="""Task: Provide an answer
You are going to provide an answer to this question: {question}, based off this context: {context}.
Keep the answer to no longer than a sentence.
agent_scratchpad: This is the scratchpad where you can store intermediate information.""",
    input_variables=["question", "context"]
)
chain = prompt_template | llm

for num in range(400):
    query = references["spørsmål"][num]
    translated_query = GoogleTranslator(source='no', target='en').translate(text=query)
    kommunenavn = references["kommunenavn"][num]
    db = databases[kommunenavn]
    found_docs = db.similarity_search(translated_query)
    context="Context"
    for doc in found_docs:
        context+=doc.page_content
    print(translated_query)
    print(context)
    answer = chain.invoke(
        {
            "question": translated_query,
            "context": context,
        }
    )
    print(answer.content)
    translated_answer = GoogleTranslator(source='en', target='no').translate(text=answer.content)
    list_of_answers.append(translated_answer)

What is the date for the adoption of the Local Plan for the city center by the city council?
ContextMunicipal sub-plan for the city centre

Plan provisions

Adopted by the city council on 26 August 2021

1

1.

MAIN INTENTIONS OF THE PLAN ................................................. ................................................ .............. 3

2.

LEGAL EFFECT OF THE PLAN ................................................ ................................................ ...................... 3

3.New, attractive residential and business areas must be arranged to a sufficient extent, at the same time that existing buildings are remodeled to meet new needs.

2. The plan's legal effect

1. The present provisions are linked to the planning map with PlanID: K-201201, last revised 2.2.2021. 2. The municipal sub-plan consists of a plan map, provisions with 4 map annexes and plan description. 3. In the event of a conflict, this plan applies before older area plans.

3. Common provisio

In [21]:
preds = list_of_answers
refs = references["svar"]
content_list = []

# Choose the LLM that will drive the agent
llm = ChatOpenAI(model="gpt-3.5-turbo-1106")

# Define the prompt template
prompt_template = PromptTemplate(
    template="""Task: Answer Evaluation
You are given a reference answer and a predicted answer. Your task is to determine whether the predicted answer matches the reference answer correctly. It does not have to be an exact match, but it should be somewhat the same.
- The reference answer is the correct answer.
- The predicted answer is the answer generated by a model or provided by a user.
Your response should indicate whether the predicted answer is correct or not.
Reference answer: {reference}
Predicted answer: {prediction}
Is the predicted answer correct? [Yes/No]
agent_scratchpad: This is the scratchpad where you can store intermediate information.""",
    input_variables=["prediction", "reference"]
)
chain = prompt_template | llm

for num in range(400):
    score = chain.invoke(
        {
            "reference": refs[num],
            "prediction": list_of_answers[num],
        }
    )
    content_list.append(score.content)

In [22]:
count_yes = content_list.count('Yes')
count_no = content_list.count('No')

# Displaying the counts
print("Number of 'Yes':", count_yes)
print("Number of 'No':", count_no)

Number of 'Yes': 258
Number of 'No': 127


In [23]:
content_list

['Yes',
 'No',
 'Yes',
 'No',
 'Yes',
 'No',
 'No',
 'Yes',
 'No',
 'Yes, the predicted answer is correct.',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes, the predicted answer is correct.',
 'Yes',
 'No',
 'No',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'No',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'No',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'No',
 'Yes',
 'Yes.',
 'No',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'No',
 'Yes',
 'No',
 'Yes, the predicted answer is correct.',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
