In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.indexes import VectorstoreIndexCreator
import os
from transformers import pipeline
import torch
from datasets import load_dataset
import evaluate
from evaluate import load

In [None]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [None]:
os.environ["OPENAI_API_KEY"] = "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"


In [None]:
pdf_folder_path = './data/pdfs'
pdf_files = [filename for filename in os.listdir(pdf_folder_path) if not filename.startswith('.')]
loaders = [UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn)) for fn in pdf_files]
index = VectorstoreIndexCreator().from_loaders(loaders)

In [None]:
dataset = load_dataset('csv', data_files=r'C:\Users\adrianhf\Documents\test\Master\data\synthetic_data\question_with_answers.csv', split="train[:10]")

In [None]:
tokenizer = AutoTokenizer.from_pretrained("RuterNorway/Llama-2-13b-chat-norwegian").to(device)
model = AutoModelForCausalLM.from_pretrained("RuterNorway/Llama-2-13b-chat-norwegian").to(device)

In [None]:
answers_from_model = []

In [None]:
for i in range(10):
    input = index.query_with_sources(dataset[i]["Question"])
    instruction = "Svar på spørsmålet basert på det som står i 'answer'"
    prompt_template=f'''### Instruction: {instruction}
    ### Input: {input}
    ### Response:
    '''
    print("\n\n*** Generate:")
    inputs = tokenizer(prompt_template, return_tensors="pt")

    out = model.generate(**inputs, max_new_tokens=200)
    print(tokenizer.decode(out[0], skip_special_tokens=True))

    # Pipeline prompting
    print("\n\n*** Pipeline:\n\n")
    pipe = pipeline(
        "text-generation",
        model=model,
        do_sample=True,
        tokenizer=tokenizer,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.95,
        repetition_penalty=1.15
    )
    print(pipe(prompt_template)[0]['generated_text'][len(prompt_template):])
    answers_from_model.append(pipe(prompt_template)[0]['generated_text'][len(prompt_template):])

In [32]:
bertscore = load("bertscore")
bleu = evaluate.load('bleu')
rouge = evaluate.load('rouge')

In [None]:
preds = dataset[:10]["Answer"]
references = answers_from_model  

In [33]:
bert_score = bertscore.compute(predictions=preds, references=references, lang="nb")
bleu_score = bleu.compute(predictions=preds, references=references, max_order=2)
rouge_score = rouge.compute(predictions=preds, references=references)



In [36]:
avg_precision = sum(bert_score['precision']) / len(bert_score['precision'])
avg_recall = sum(bert_score['recall']) / len(bert_score['recall'])
avg_f1 = sum(bert_score['f1']) / len(bert_score['f1'])
print("BLEU SCORES")
print(bleu_score)
print("ROUGE SCORES")
print(rouge_score)
print("BERT SCORES")
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average F1 Score:", avg_f1)

BLEU SCORES
{'bleu': 0.04503680447813191, 'precisions': [0.08637236084452975, 0.023483365949119372], 'brevity_penalty': 1.0, 'length_ratio': 4.235772357723577, 'translation_length': 521, 'reference_length': 123}
ROUGE SCORES
{'rouge1': 0.12114382463082675, 'rouge2': 0.047058823529411764, 'rougeL': 0.10144628099173554, 'rougeLsum': 0.11016037777971496}
BERT SCORES
Average Precision: 0.36444942355155946
Average Recall: 0.4268353283405304
Average F1 Score: 0.3924579620361328
