In [16]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import numpy as np
import pandas as pd
import torch

from src.RAG_Calculater import RAG, get_top_n_articles
from src.Massege_Factory import massage_factory
from src.Case_Builder import (device,
                              bert_version,
                              bert_model_name,
                              genai_version,
                              genai_model_name,
                              prompt_strategy_used,
                              dataset_name,
                              massage_strategy
                              )

In [5]:
# Read the three dataset splits from src/dataset/clean/<dataset_name>/, where
# each file is named <bert_version>_<split>.json. The generator is unpacked in
# train / validation / test order.
data_train, data_val, data_test = (
    pd.read_json(f'src/dataset/clean/{dataset_name}/{bert_version}_{split}.json')
    for split in ('train', 'validation', 'test')
)

In [None]:
# Derive the 'rag_sentences' column for every split by applying RAG
# (imported from src.RAG_Calculater) element-wise to 'sentences_similarity'.
for split_frame in (data_train, data_val, data_test):
    split_frame['rag_sentences'] = split_frame['sentences_similarity'].apply(RAG)

In [None]:
# Load the tokenizer and the causal LM for the checkpoint named by
# `genai_model_name`. The model weights are requested in bfloat16 with
# low_cpu_mem_usage enabled.
tokenizer = AutoTokenizer.from_pretrained(genai_model_name)
model = AutoModelForCausalLM.from_pretrained(
    genai_model_name,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)

In [None]:
# Put the model into evaluation mode. Because this call is the last expression
# of the cell, the module it returns is also rendered as the cell output.
model.eval()

In [None]:
# Build the text-generation pipeline around the `model` and `tokenizer` that
# were loaded above from `genai_model_name`. Passing the loaded objects (rather
# than a hard-coded checkpoint id) keeps generation consistent with the
# configured model and avoids loading the weights a second time.
# NOTE(review): `device` is imported from src.Case_Builder but is not applied
# here; confirm whether the pipeline/model should be placed on it.
chatbot = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,         # upper bound on generated tokens per request
    repetition_penalty=1.2,
    no_repeat_ngram_size=3,
)

# Accumulators filled by the generation loop: model answers and the
# corresponding reference summaries, in test-set order.
results = []
summaries = []

In [None]:
# Generate one answer per test row and collect it alongside the reference
# summary.
#
# The accumulators are (re)initialised here so that re-running this cell
# starts from empty lists instead of appending a second copy of every row.
results = []
summaries = []

n_test = len(data_test)
for idx in range(n_test):
    print(f"{idx+1} / {n_test}")

    # `idx` is a positional counter, so select the row by position; this does
    # not depend on the DataFrame's index labels being exactly 0..n-1.
    target_row = data_test.iloc[idx]

    if massage_strategy == "few_shot":
        # Few-shot: pick 3 training rows to use as in-prompt references.
        ref_rows_indexes = get_top_n_articles(data_train, target_row, n=3)
        ref_rows = data_train.loc[ref_rows_indexes]
    else:
        ref_rows = None

    massage = massage_factory(massage_strategy, target_row, ref_rows)

    # The 'summary' cell is joined with single spaces to form the reference
    # text (assumes it holds an iterable of strings — TODO confirm).
    summary = " ".join(target_row['summary'])

    # Take the 'content' of the last entry of 'generated_text' from the first
    # returned generation (presumably the assistant reply when `massage` is a
    # chat-style message list — verify against massage_factory).
    answer = chatbot(massage)[0]['generated_text'][-1]['content']

    results.append(answer)
    summaries.append(summary)
    

In [None]:
# Pair each reference summary with the generated prediction, one row per
# test example, in the order they were collected.
model_results = pd.DataFrame(dict(reference=summaries, prediction=results))

In [None]:
# Write the reference/prediction table to results/ as CSV without the index
# column. The file name encodes the generator version, BERT version, dataset
# and prompt strategy.
# NOTE(review): the name always contains "ZeroShot", even though
# `massage_strategy` may be "few_shot" — confirm this is intended.
output_path = (
    f'results/{genai_version}_ZeroShot_summaries_'
    f'{bert_version}_{dataset_name}_{prompt_strategy_used}.csv'
)
model_results.to_csv(output_path, index=False)