In [1]:
import os
import time

import google.generativeai as genai
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

from src.Case_Builder import CaseBuilder
from src.Prompt_Factory import PromptFactory
from src.RAG_Calculater import RAG, get_top_n_articles
from src.Result_Calculater import ResultCalculater

In [2]:
# Experiment configuration for this run. The loading cell further down reads
# `dataset_name` and `bert_version` back from this object to locate the data
# files, and the saving cell uses `get_case_signature()` to name its outputs.
# NOTE(review): the first argument is 'BioMistral', yet generation in this
# notebook goes through a Gemini model -- confirm the case label (and therefore
# the result file names) matches the model that actually produced the summaries.
case_builder = CaseBuilder(
    'BioMistral',
    'few_shot',
    'BioBERT',
    "elife",
    1,
)

# The prompt factory is built from the same case configuration.
prompt_factory = PromptFactory(case_builder)

In [3]:
# JSON files for the train / validation / test splits of the configured dataset
# and BERT variant, listed in that order.
split_paths = (
    f'src/dataset/clean/{case_builder.dataset_name}/{case_builder.bert_version}_train.json',
    f'src/dataset/clean/{case_builder.dataset_name}/{case_builder.bert_version}_validation.json',
    f'src/dataset/clean/{case_builder.dataset_name}/{case_builder.bert_version}_test.json',
)

# One DataFrame per split, loaded in the same order as the paths above.
data_train, data_val, data_test = map(pd.read_json, split_paths)

In [4]:
# Add a 'rag_sentences' column to every split by passing each row's
# 'sentences_similarity' value through RAG (imported from src.RAG_Calculater).
for split_frame in (data_train, data_val, data_test):
    split_frame['rag_sentences'] = split_frame['sentences_similarity'].apply(RAG)

In [5]:
# Read the Gemini API key from the environment rather than embedding it in the
# notebook: a key stored in a notebook is exposed to everyone who can read the
# file or its version history.
# NOTE(review): any key that has ever been committed in this cell should be
# treated as leaked and revoked in the Google AI console.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it before running this notebook."
    )

genai.configure(api_key=api_key)
model = genai.GenerativeModel("gemini-1.5-flash")

In [6]:
# Tunables for the generation loop.
N_REFERENCE_ARTICLES = 3    # training articles retrieved per test article
REQUEST_DELAY_SECONDS = 2   # pause between API calls (presumably for rate limits -- confirm)

# Markers the model may print before the summary, checked in this order.
LAY_SUMMARY_MARKERS = ('lay_summary:', '**Lay Summary:**')


def extract_lay_summary(text):
    """Return the summary portion of a raw model response.

    If `text` contains one of LAY_SUMMARY_MARKERS, the segment that follows the
    first occurrence of that marker (up to the next occurrence, if any) is
    returned; otherwise the whole text is returned. The result is stripped of
    surrounding whitespace in both cases.
    """
    for marker in LAY_SUMMARY_MARKERS:
        if marker in text:
            return text.split(marker)[1].strip()
    return text.strip()


results = []      # generated summaries, aligned with `summaries`
summaries = []    # reference summaries joined into a single string
failed_rows = []  # positions of test rows for which the API returned no text

n_test = len(data_test)

for idx in range(n_test):
    print(f"\r {idx+1} / {n_test}", end="")

    # `idx` is a position, so index positionally; this does not rely on the
    # frame having a default 0..n-1 index.
    target_row = data_test.iloc[idx]

    summary = " ".join(target_row['summary'])

    # Pick the training articles whose title embeddings are closest to this one.
    ref_rows_indexes = get_top_n_articles(
        data_train['title_embedding'],
        target_row['title_embedding'],
        n=N_REFERENCE_ARTICLES,
    )
    ref_rows = data_train.loc[ref_rows_indexes].reset_index(drop=True)

    prompt_factory.set_row(target_row, ref_rows)
    prompt = prompt_factory.get_prompt()

    response = model.generate_content(prompt)
    try:
        response_text = response.text
    except ValueError as err:
        # `response.text` raises ValueError when the response carries no usable
        # text part (e.g. it was blocked). Record an empty prediction so one bad
        # row does not abort the run and references/predictions stay aligned.
        print(f"\n[warn] no text returned for test row {idx}: {err}")
        failed_rows.append(idx)
        response_text = ""

    answer = extract_lay_summary(response_text)

    summaries.append(summary)
    results.append(answer)

    time.sleep(REQUEST_DELAY_SECONDS)

 241 / 241

In [7]:
# Sanity check: show the first entry of `results` that is not exactly a `str`
# (value and type); print nothing when every entry is a string.
_NOT_FOUND = object()
first_non_str = next(
    (entry for entry in results if type(entry) != str),
    _NOT_FOUND,
)
if first_non_str is not _NOT_FOUND:
    print(first_non_str)
    print(type(first_non_str))

In [8]:
# One row per test article: the reference summary next to the generated one.
columns_by_name = {'reference': summaries, 'prediction': results}
model_results = pd.DataFrame(columns_by_name)

In [9]:
# Score the predictions against the references for the configured case.
result_calculater = ResultCalculater(
    case_builder,
    model_results,
)

# Keep the metrics in `result_df`; later cells save and display it.
result_df = result_calculater.get_results()

100%|██████████| 49/49 [28:34<00:00, 34.99s/it]
100%|██████████| 49/49 [06:18<00:00,  7.72s/it]


In [10]:
# Persist the reference/prediction pairs and the computed metrics for this case.
RESULTS_DIR = 'results'

# `to_csv` does not create missing directories; make sure the target exists so
# the outputs of the long-running cells above are not lost to an OSError.
os.makedirs(RESULTS_DIR, exist_ok=True)

# Compute the signature once so both files are guaranteed to share the same suffix.
case_signature = case_builder.get_case_signature()

# NOTE(review): no file extension is appended here; presumably the signature
# already ends in '.csv' -- confirm against CaseBuilder.get_case_signature().
model_results.to_csv(f'{RESULTS_DIR}/test_summaries_{case_signature}', index=False)
result_df.to_csv(f'{RESULTS_DIR}/test_result_{case_signature}', index=False)

In [11]:
# Display the metrics table as the cell's output (bare last expression).
result_df

Unnamed: 0,ROUGE1,ROUGE2,ROUGEL,BERTScore_Precision,BERTScore_Recall,BERTScore_F1,FKGL,DCRS,BARTScore
0,0.298427,0.070468,0.155124,0.650486,0.564874,0.604214,12.304149,11.115477,3.362549
