# Test Keyword Extractor
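This notebook tests the keyword extractor model by generating keywords for the test questions and scoring them against the reference texts with the ROUGE metric.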

In [1]:
from datasets import Dataset
from datetime import datetime as dt
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from transformers.pipelines.pt_utils import KeyDataset

import const
import evaluate
import numpy as np
import pandas as pd
import torch

In [2]:
# Folder of the keyword-extractor checkpoint that this notebook evaluates.
KW_MODEL_PATH = f"{const.MODELS_FOLDER}aletheianomous_ai-keyword_extractor-v0.3.1/"
# CSV file that is loaded as the test set further down.
TESTING_DS = f"{const.DATASETS_FOLDER}squad_ds_keyword_test.csv"

In [3]:
# Quantization settings passed to the model loader: 4-bit weights using the
# "nf4" quant type, bfloat16 as the compute dtype, double quantization off.
bnb_config = BitsAndBytesConfig(
    **{
        "load_in_4bit": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
        "bnb_4bit_use_double_quant": False,
    }
)

In [4]:
# Load the checkpoint with the quantization settings defined above and let
# the library choose device placement.
kw_model = AutoModelForCausalLM.from_pretrained(
    KW_MODEL_PATH,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
# The tokenizer is read from the "tokenizer" sub-folder of the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    f"{KW_MODEL_PATH}tokenizer",
    trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [5]:
# Pad on the right-hand side and reuse the unknown token as the pad token.
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.unk_token
# Turn on the tokenizer's add_eos_token flag (presumably appends the EOS
# token when encoding -- confirm for this tokenizer class).
tokenizer.add_eos_token = True

In [6]:
# Notebook display: show the string the tokenizer uses as end-of-sequence.
tokenizer.eos_token

'</s>'

In [7]:
# Load the test set; later cells read its "question" and "text" columns.
testing_ds = pd.read_csv(TESTING_DS)

In [8]:
# Notebook display: inspect the "text" value of the first test row.
testing_ds.loc[0, "text"]

"<|system|>\nYou are a chatbot that assists in providing information to the user. Please generate a keyword from the user's question.</s>\n<|user|>\nIn what country is Normandy located?</s>\n<|assistant|>\ncountry is Normandy located"

In [9]:
def apply_chat_template(user_prompt):
    """Wrap a user question in the chat format used by the keyword extractor.

    PARAMETERS
    user_prompt - The user's question, as a string.

    RETURNS
    One string made of the fixed system instruction, the question inside
    the <|user|> turn, and an opened <|assistant|> turn for the model to
    complete.
    """
    system_and_user_header = (
        "<|system|>\nYou are a chatbot "
        "that assists in providing "
        "information to the user. "
        "Please generate a keyword "
        "from the user's question.</s>\n<|user|>\n"
    )
    assistant_header = "</s>\n<|assistant|>\n"
    return "".join((system_and_user_header, user_prompt, assistant_header))

In [10]:
# Ad-hoc question used to try the pipeline once before the full evaluation.
user_prompt = "Who is Da Regular Sauce?"

In [11]:
# Format the sample question, then build the text-generation pipeline
# around the already-loaded model and tokenizer.
prompt = apply_chat_template(user_prompt)
kw_pipe = pipeline(
    "text-generation",
    model=kw_model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    batch_size=1,
)

In [12]:
# Generate one sampled completion for the sample prompt.
sample_out = kw_pipe(
    prompt,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
)

In [13]:
# Notebook display: show the raw pipeline output for the sample prompt.
sample_out

[{'generated_text': "<|system|>\nYou are a chatbot that assists in providing information to the user. Please generate a keyword from the user's question.</s>\n<|user|>\nWho is Da Regular Sauce?</s>\n<|assistant|>\nDa Regular Sauce"}]

In [14]:
def iterate_prompt(prompt):
    """Yield the items of `prompt` one at a time, in order.

    PARAMETERS
    prompt - An iterable of prompts.

    RETURNS
    A generator producing each element of `prompt`.
    """
    yield from prompt

In [15]:
def prepare_prompts_for_val(val_feats):
    """Apply the chat template to every validation question.

    PARAMETERS
    val_feats - A sized iterable of user questions.

    RETURNS
    A list with one formatted prompt per question, in the same order
    as val_feats.
    """
    val_feats_len = len(val_feats)
    prompt_ls = []
    for i, prompt in enumerate(val_feats, start=1):
        # Compute the percentage for the item being reported, so the figure
        # always agrees with the "( i / total )" counter printed next to it.
        percentage = (i / val_feats_len) * 100
        print("Tokenizing prompts...", percentage, "% (", i, "/", val_feats_len, ")",
              end="                                                           \r")
        prompt_ls.append(apply_chat_template(prompt))
    return prompt_ls

In [16]:
def predictions_to_np(predictions):
    """Collect the generated text of each pipeline output into a NumPy array.

    PARAMETERS
    predictions - A sized sequence of pipeline outputs. Each item is read
        as item[0]['generated_text'].

    RETURNS
    A NumPy array holding one generated string per prediction, in order.
    """
    pred_len = len(predictions)
    pred_ls = []
    for item_num, item in enumerate(predictions, start=1):
        # Update the percentage on every item; it used to stay at 0 for the
        # whole run because it was never recomputed.
        percent = (item_num / pred_len) * 100
        print("Converting predictions to sample...", percent, "% (", item_num, "/", pred_len, ")", end="                      \r")
        pred_ls.append(item[0]['generated_text'])
    return np.array(pred_ls)

In [17]:
def evaluate_kw(model_pipe, val_ds):
    """This function evaluates the keyword extractor model
       by the ROUGE metric.

        PARAMETERS
        model_pipe - The pipeline object that runs the keyword
            extractor model.
        val_ds - The dataset to evaluate the keyword extractor model.
            Its 'question' column supplies the inputs and its 'text'
            column supplies the references.

        RETURNS
        The result of the ROUGE metric's compute() call on the
        generated texts and the reference texts.

    """

    print("EVALUATING MODEL...\n")
    text_labels = val_ds['text'].to_numpy()
    prompts = prepare_prompts_for_val(val_ds['question'].to_numpy())

    rouge = evaluate.load('rouge')

    # Hand the prompts over as a generator and materialise the outputs into
    # a list, because predictions_to_np needs len() of the predictions.
    # NOTE(review): do_sample=True means scores can differ between runs --
    # confirm whether sampling is intended for evaluation.
    predictions = list(
        model_pipe(
            iterate_prompt(prompts),
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
        )
    )
    predictions = predictions_to_np(predictions)

    # NOTE(review): the 'text' references appear to contain the full prompt
    # plus the expected keyword, and the generated text appears to echo the
    # prompt as well, so the shared prompt likely inflates the ROUGE scores.
    # Confirm, and consider scoring only the part after "<|assistant|>".
    return rouge.compute(predictions=predictions, references=text_labels)
        

In [18]:
# Time one evaluation pass over the first 5000 rows of the test set.
start = dt.now()
results = evaluate_kw(kw_pipe, testing_ds[:5000])
end = dt.now()
print("Duration: ", end - start)

EVALUATING MODEL...

Duration:  1:23:53.766209 sample... 0 % ( 5000 / 5000 )                      


In [19]:
# Report each ROUGE score from the evaluation as a percentage.
print("Validation loss metrics:")
print("Rouge-1 score: ", results['rouge1'] * 100, "%", sep="")
print("Rouge-2 score: ", results['rouge2'] * 100, "%", sep="")
print("Rouge-L score: ", results['rougeL'] * 100, "%", sep="")
print("Rouge-L Sum score: ", results['rougeLsum'] * 100, "%", sep="")

Validation loss metrics:
Rouge-1 score: 98.83077071559526%
Rouge-2 score: 98.36796605743359%
Rouge-L score: 98.81688987203006%
Rouge-L Sum score: 98.83023886598299%
