In [None]:
import os
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.document_loaders import UnstructuredFileLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Qdrant
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline
from langchain.indexes import VectorstoreIndexCreator

In [None]:
# Never hardcode API keys in a notebook: they leak through version control and
# every shared copy. The key must be provided by the environment instead
# (e.g. `export OPENAI_API_KEY=...` before starting Jupyter, or an untracked .env file).
# NOTE(review): the key that used to be committed in this cell must be treated
# as compromised and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    # Only notify instead of raising/prompting so a headless "Run All" still works
    # for the cells that do not call OpenAI.
    print(
        "OPENAI_API_KEY is not set; export it in your shell before running "
        "any cell that calls the OpenAI API."
    )

In [None]:
# Sentence-transformers checkpoint used to embed text; the resulting
# `embeddings` object is handed to the Qdrant vector store further down.
EMBEDDING_MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [None]:
# Select every PDF below data/ (recursive glob) and parse each file with
# UnstructuredFileLoader, showing a progress bar while loading.
loader = DirectoryLoader(
    'data/',
    glob="**/*.pdf",
    loader_cls=UnstructuredFileLoader,
    show_progress=True,
)

In [None]:
# Run the directory loader; `documents` holds everything it parsed from the
# PDFs selected by `loader` and is the input to the text splitter below.
documents = loader.load()

In [None]:
# Chunking parameters for the splitter (presumably measured in characters,
# the splitter's default length function — confirm).
CHUNK_SIZE = 600
CHUNK_OVERLAP = 70
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

In [None]:
# Split the loaded documents into chunks with `text_splitter`; `texts` is what
# gets embedded and indexed in the Qdrant collection below.
texts = text_splitter.split_documents(documents)

In [None]:
# Embed every chunk in `texts` with `embeddings` and index the vectors in a
# Qdrant collection. `Qdrant` is already imported in the notebook's import
# cell, so it is not re-imported here.
qdrant = Qdrant.from_documents(
    texts,
    embeddings,
    location=":memory:",  # Local mode with in-memory storage only
    collection_name="my_documents",
)

In [None]:
# Ad-hoc retrieval check with one hand-written Norwegian question (roughly:
# "I am thinking of making use of detached holiday buildings. What is the
# max % BYA?"). The hits are stored in `found_docs` but not displayed here;
# both names are overwritten by the loops in later cells.
query = "Jeg tenker å utnytte frittliggende fritidsbebyggelse. Hva er maks % BYA?"
found_docs = qdrant.similarity_search(query)

In [None]:
# Evaluation set: the first 10 rows of the synthetic question/answer CSV.
# The location can be overridden with the QA_DATASET_CSV environment variable
# so the notebook is not tied to a single machine; the original absolute path
# is kept as the default for backward compatibility.
qa_csv_path = os.environ.get(
    "QA_DATASET_CSV",
    r'/Users/adrianfolge/Documents/lokal:skole/Master/data/synthetic_data/question_with_answers.csv',
)
# Later cells read the "Question" column of this dataset.
references = load_dataset('csv', data_files=qa_csv_path, split="train[:10]")

In [None]:
# Retrieval sanity check: for each evaluation question print the question
# followed by the text of the best-matching chunk.
# Iterating over the column (instead of a fixed range(10)) keeps this working
# when the CSV holds fewer than 10 rows.
for query in references["Question"]:
    found_docs = qdrant.similarity_search(query)
    print(f'{query} #####################\n')
    if found_docs:
        print(f'{found_docs[0].page_content}\n')
    else:
        # An empty result would otherwise raise IndexError on found_docs[0].
        print('<no documents retrieved>\n')

In [None]:
# Tokenizer and causal LM are loaded from the same Hugging Face checkpoint;
# naming the id once keeps the two in sync.
MODEL_ID = "RuterNorway/Llama-2-13b-chat-norwegian"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

In [None]:
# Answer every evaluation question with the LLM, using the best retrieved
# chunk as context. Two completions are printed per question (a direct
# `generate` call and the sampling pipeline); the pipeline's answer is
# collected in `answers_from_model`, in the order of `references["Question"]`.

# Norwegian for: "Answer the question based on what is written in 'context'".
instruction = "Svar på spørsmålet basert på det som står i 'context'"

# Building the pipeline does not depend on the question, so do it once
# instead of once per iteration.
pipe = pipeline(
    "text-generation",
    model=model,
    do_sample=True,
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.15
)

answers_from_model = []
# Iterate over the column rather than a fixed range(10) so a shorter CSV
# does not raise IndexError.
for query in references["Question"]:
    found_docs = qdrant.similarity_search(query)
    # Fall back to an empty context when nothing is retrieved, so the answer
    # list stays aligned with the questions instead of crashing.
    context = found_docs[0].page_content if found_docs else ""

    # Must be an f-string: without the prefix the model receives the literal
    # text "{query}" / "{context}" instead of the question and the chunk.
    # (Also avoids shadowing the builtin `input`.)
    model_input = f"Spørsmål: {query} context: {context}"

    # Assemble the prompt line by line; a triple-quoted literal inside the
    # loop would carry the code indentation into the prompt text.
    prompt_template = (
        f"### Instruction: {instruction}\n"
        f"### Input: {model_input}\n"
        "### Response:\n"
    )

    print("\n\n*** Generate:")
    inputs = tokenizer(prompt_template, return_tensors="pt")
    out = model.generate(**inputs, max_new_tokens=200)
    print(tokenizer.decode(out[0], skip_special_tokens=True))

    # Pipeline prompting
    print("\n\n*** Pipeline:\n\n")
    # Sampling is stochastic: generate once so the printed answer is the one
    # that is stored (and the model is not run twice). The slice strips the
    # echoed prompt from the generated text.
    answer = pipe(prompt_template)[0]['generated_text'][len(prompt_template):]
    print(answer)
    answers_from_model.append(answer)