pip install --force-reinstall openai==1.8

In [1]:
from datasets import load_dataset
import pandas as pd
from rouge import Rouge
from openai import OpenAI
import json
from settings import *
from utils import preprocess, prompt, score, utils
import os 

from langchain.llms import OpenAI
from langchain import HuggingFaceHub, LLMChain
from langchain.prompts import load_prompt, PromptTemplate
from tqdm import tqdm

from langchain.prompts import PromptTemplate
from langchain import LLMChain
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, GenerationConfig
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
import warnings
warnings.filterwarnings("ignore")

import torch
import numpy as np                                                              
config = utils.load_json(CONFIG_DIR)
np.random.seed(2024)

In [2]:
pubmed = utils.load_data(config['data_name'])
pubmed.head(3)

Unnamed: 0,article,abstract
0,a review of the literature and an extensive me...,backgrounda review of the literature and an ex...
1,"nathan , as an oncology fellow , knew well tha...",t cells tell macrophages when to start making ...
2,temporary henna tattoos or pseudotattoo have b...,temporary henna tattoos or pseudotattoos have ...


In [3]:
# MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
# MODEL_NAME = "meta-llama/Llama-2-7b-hf"
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"

# cache_dir = "/data/ephemeral/Youtube-Short-Generator/models/mistral"
# cache_dir = "/data/ephemeral/Youtube-Short-Generator/models/llama"
cache_dir = "/data/ephemeral/Youtube-Short-Generator/models/llama_chat"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME,cache_dir=cache_dir)
tokenizer.pad_token_id = tokenizer.eos_token_id
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    cache_dir=cache_dir)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=100, device = 0, pad_token_id=tokenizer.eos_token_id)
hf = HuggingFacePipeline(pipeline=pipe)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# template = """
# <s>[INST]<>You are an abstractive summarizer that follows the output pattern.
# Please revise the extracted summary based on the document. The revised summary should include the information in the extracted summary. 
# Original Document: {document}
# Summarization: <>[/INST]<\s>.
# """


# Llama_chat(In-context)
#  template = """
# <s>[INST]<>You are an extractive summarizer that follows the output pattern.
# The following examples are successful extractive summarization instances: 

# Example Document: 'the three methods of assesing glycemic control , hba1c , smbg and cgms provide distinct information , yet complementary at the same time . hba1c does not equally reflect the glycemic values over the three months that forego its determination . \n hba1c assesses the average glycemic exposure in time without being able to differentiate between preprandial and postprandial glycemia , possible hypo and hyper glycemia . \n this method is able to identify both hypoglycemic and hyperglycemic episodes allowing immediate therapeutic decisions and therefore a glycemic balance closer to normal . \n introduction of cgms in the assessment of the glycemic status represents a great technological advance . \n this glucose monitoring method clears glycemic balance abnormalities in an otherwise impossible to obtain manner , evaluating both therapeutic efficiency and glycemic control . even if cgm systems are far from being implemented at a large scale in current practice , they are about to change the diabetes management by providing an optimal glycemic control .'
# Example Summary: 'type 2 diabetes is a chronic disease and maintaining a tight glycemic control is essential to prevent both microvascular and macrovascular complications , as demonstrated in previous studies . \n it is essential to monitor the glucose levels in order to achieve the targets . \n the blood glucose monitoring can be done by different methods : glycated haemoglobin a1c , self - monitoring of blood glucose ( before and after meals ) with a glucometer and continuous glucose monitoring with a system that measures interstitial glucose concentrations . even though glycated haemoglobin a1c is considered the  gold standard  of diabetes care \n , it does not provide complete information about the magnitude of the glycemic disequilibrium . \n therefore the self - monitoring and continuous monitoring of blood glucose are considered an important adjunct for achieving and maintaining optimal glycemic control . \n the three methods of assessing glycemic control : hba1c , smbg and cgms provide distinct but at the same time complementary information ,'

# Please summarize the following document.The summary should contain 3 sentences.
# Original Document: {document}<>[/INST]<\s>.
# """

In [10]:
def llm(doc):
    template = """
    <s>[INST]<>You are an extractive summarizer that follows the output pattern.
    The following examples are successful extractive summarization instances: 

    Example Document: 'the three methods of assesing glycemic control , hba1c , smbg and cgms provide distinct information , yet complementary at the same time . hba1c does not equally reflect the glycemic values over the three months that forego its determination . \n hba1c assesses the average glycemic exposure in time without being able to differentiate between preprandial and postprandial glycemia , possible hypo and hyper glycemia . \n this method is able to identify both hypoglycemic and hyperglycemic episodes allowing immediate therapeutic decisions and therefore a glycemic balance closer to normal . \n introduction of cgms in the assessment of the glycemic status represents a great technological advance . \n this glucose monitoring method clears glycemic balance abnormalities in an otherwise impossible to obtain manner , evaluating both therapeutic efficiency and glycemic control . even if cgm systems are far from being implemented at a large scale in current practice , they are about to change the diabetes management by providing an optimal glycemic control .'
    Example Summary: 'type 2 diabetes is a chronic disease and maintaining a tight glycemic control is essential to prevent both microvascular and macrovascular complications , as demonstrated in previous studies . \n it is essential to monitor the glucose levels in order to achieve the targets . \n the blood glucose monitoring can be done by different methods : glycated haemoglobin a1c , self - monitoring of blood glucose ( before and after meals ) with a glucometer and continuous glucose monitoring with a system that measures interstitial glucose concentrations . even though glycated haemoglobin a1c is considered the  gold standard  of diabetes care \n , it does not provide complete information about the magnitude of the glycemic disequilibrium . \n therefore the self - monitoring and continuous monitoring of blood glucose are considered an important adjunct for achieving and maintaining optimal glycemic control . \n the three methods of assessing glycemic control : hba1c , smbg and cgms provide distinct but at the same time complementary information ,'
    
    Please summarize the following document.The summary should contain 3 sentences.
    Original Document: {document}<>[/INST]<\s>.
    """
    prompt = PromptTemplate(template=template, input_variables=["document"])
    llm_chain = LLMChain(prompt=prompt, llm=hf)
    response = llm_chain.invoke(input = doc)
    torch.cuda.empty_cache()
    return response['text']

In [9]:
def get_summarization(df,save_name, iter_num = 5):
    for i in range(iter_num):
        response_list = []
        for idx in tqdm(range(len(df)), total = len(df)):
            response = llm(df.iloc[idx,0])
            
            if len(response) > 0:
                response_list.append([response, df.iloc[idx, 1]])
        df = pd.DataFrame(response_list, columns = ['generate', 'abstract'])
        df.to_csv(os.path.join(OUT_DIR, f"{save_name}_{i}.csv"), index = False)

In [None]:
save_name = 'llama_chat'
sample = pubmed.sample(n=500)
result_df = get_summarization(sample, save_name, 3)

In [12]:
model_avg_rouge = score.get_rouge_list_from_all_df(save_name)
model_avg_rouge

Get Rouge List From all Dataframe: 100%|██████████| 1/1 [00:03<00:00,  3.70s/it]


array([0.23141175])

In [7]:
score.save_rouge_avg(model_avg_rouge,'mistral')

In [9]:
score.statistic_from_rouge_list('mistral_result.npy')

Rouge List:  [0.22415016 0.19500024 0.20017621]
Mean :0.206
Standard Deviation:0.013


(0.206, 0.013)

In [16]:
a = pd.read_csv(os.path.join(OUT_DIR, 'mis_0.csv'))

In [19]:
a.iloc[1,1]

't cells tell macrophages when to start making the toxic soup of lysosomal enzymes , reactive oxygen species , and nitric oxide that destroys intracellular pathogens . in 1983 \n , carl nathan proved that this start signal comes in the form of the secreted cytokine ifn.'

In [20]:
a.iloc[1,0]

' {"summary": [\n      "Nathan, an oncology fellow, observed that chemotherapy lowered patients\' white blood cells and increased their risk of infections.",\n      "Nathan found that the supernatant from activated T cells induced macrophage activation, suggesting a secreted factor.",\n      "Nathan and his collaborators were frustrated with the primitive protein separation methods and lacked the technology to identify the specific protein.",\n      "Nathan discovered that activated macrophages produced hydrogen peroxide, a phenomenon known as the respiratory burst.",\n      "In 1983, Nathan published a seminal paper showing that interferon (IFN) depleted from unpurified T cell supernatants decreased macrophage activation and killing of intracellular protozoa, but adding back recombinant IFN restored the activity.",\n      "Nathan\'s discovery of IFN\'s role in macrophage activation led to the development of treatments for leprosy by injecting recombinant IFN directly into cutaneous le