In [1]:
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.document_loaders import UnstructuredFileLoader, DirectoryLoader
from langchain_community.embeddings import SentenceTransformerEmbeddings
import os
from langchain_community.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline
from datasets import load_dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(





  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
loader = DirectoryLoader('data/', glob="**/*.pdf", show_progress=True, loader_cls=UnstructuredFileLoader)
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0)
docs = text_splitter.split_documents(documents)
embeddings = SentenceTransformerEmbeddings(model_name='sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

100%|██████████| 10/10 [00:07<00:00,  1.35it/s]
Created a chunk of size 724, which is longer than the specified 500
Created a chunk of size 2724, which is longer than the specified 500
Created a chunk of size 2250, which is longer than the specified 500
Created a chunk of size 2864, which is longer than the specified 500
Created a chunk of size 530, which is longer than the specified 500
Created a chunk of size 665, which is longer than the specified 500
Created a chunk of size 583, which is longer than the specified 500
Created a chunk of size 1162, which is longer than the specified 500
Created a chunk of size 607, which is longer than the specified 500
Created a chunk of size 1206, which is longer than the specified 500
Created a chunk of size 697, which is longer than the specified 500
Created a chunk of size 734, which is longer than the specified 500
Created a chunk of size 916, which is longer than the specified 500
Created a chunk of size 509, which is longer than the specified

In [3]:
db = FAISS.from_documents(docs, embeddings)

In [4]:
dataset = load_dataset('csv', data_files=r'C:\Users\adrianhf\Documents\test\Master\data\synthetic_data\question_with_answers.csv', split="train[:10]")

In [5]:
tokenizer = AutoTokenizer.from_pretrained("RuterNorway/Llama-2-13b-chat-norwegian")
model = AutoModelForCausalLM.from_pretrained("RuterNorway/Llama-2-13b-chat-norwegian")

Loading checkpoint shards: 100%|██████████| 3/3 [00:45<00:00, 15.21s/it]


In [6]:
answers_from_model = []
for i in range(10):
    query = dataset["Question"][i]
    found_docs = db.similarity_search(query)
    context = found_docs[0].page_content
    input = f"Spørsmål: {query} context: {context}"
    instruction = "Svar på spørsmålet basert på det som står i 'context'"
    prompt_template=f'''### Instruction: {instruction}
    ### Input: {input}
    ### Response:
    '''
    print("\n\n*** Generate:")
    inputs = tokenizer(prompt_template, return_tensors="pt")

    out = model.generate(**inputs, max_new_tokens=200)
    print(tokenizer.decode(out[0], skip_special_tokens=True))

    # Pipeline prompting
    print("\n\n*** Pipeline:\n\n")
    pipe = pipeline(
        "text-generation",
        model=model,
        do_sample=True,
        tokenizer=tokenizer,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.95,
        repetition_penalty=1.15
    )
    print(pipe(prompt_template)[0]['generated_text'][len(prompt_template):])
    answers_from_model.append(pipe(prompt_template)[0]['generated_text'][len(prompt_template):])



*** Generate:
### Instruction: Svar på spørsmålet basert på det som står i 'context'
    ### Input: Spørsmål: Hva er datoen for vedtaket av Kommunedelplan for sentrum av bystyret? context: SALTDAL KOMMUNE

DETALJREGULERING ADKOMSTVEG PRESTMOEN SKYTEBANEANLEGG, BRENNE GNR 15/22 M.FL. PLANID 2016004

REGULERINGSBESTEMMELSER

Dato for siste revisjon av bestemmelsene Dato for godkjenning av plan

: :

1

Planens hensikt

________________________________________________________________________________________

Reguleringen skal legge til rette for uttak av masser med tilhørende infrastruktur.
    ### Response:
    2016


*** Pipeline:


18. mai 2017


*** Generate:
### Instruction: Svar på spørsmålet basert på det som står i 'context'
    ### Input: Spørsmål: Hva er hovedintensjonene i planen som er beskrevet i dokumentet? context: 1. Planens hensikt Hva som er hovedhensikten med planen, poengtert angitt. Utfyllende beskrivelse skal fremgå av planbeskrivelsen. Hensikten med planen bør for

In [None]:
# Convert the list to a pandas DataFrame
df = pd.DataFrame(answers_from_model, columns=['Text'])

# Specify the file path
file_path = "C:\\Users\\adrianhf\\Documents\\test\\Master\\data\\Results\\Faiss_answers_from_model.csv"

# Write the DataFrame to a CSV file
df.to_csv(file_path, index=False)