pip install --force-reinstall openai==1.8

In [1]:
from datasets import load_dataset
import pandas as pd
from rouge import Rouge
from openai import OpenAI
import json
from settings import *
from utils import preprocess, prompt, score, utils
import os 

from langchain.llms import OpenAI
from langchain import HuggingFaceHub, LLMChain
from langchain.prompts import load_prompt, PromptTemplate
from tqdm import tqdm

from langchain.prompts import PromptTemplate
from langchain import LLMChain
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, GenerationConfig
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
import warnings
warnings.filterwarnings("ignore")

import torch
import numpy as np                                                              
config = utils.load_json(CONFIG_DIR)
np.random.seed(1)

In [2]:
pubmed = utils.load_data(config['data_name'])
pubmed.head(3)

Unnamed: 0,article,abstract
0,a review of the literature and an extensive me...,backgrounda review of the literature and an ex...
1,"nathan , as an oncology fellow , knew well tha...",t cells tell macrophages when to start making ...
2,temporary henna tattoos or pseudotattoo have b...,temporary henna tattoos or pseudotattoos have ...


In [3]:
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
cache_dir = "/data/ephemeral/Youtube-Short-Generator/mistral"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME,cache_dir=cache_dir)
tokenizer.pad_token_id = tokenizer.eos_token_id
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    cache_dir=cache_dir)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=100, device = 0, pad_token_id=tokenizer.eos_token_id)
hf = HuggingFacePipeline(pipeline=pipe)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
def get_summarization(df,save_name, iter_num = 5):
    template = """
    <s>[INST]<>You are an abstractive summarizer that follows the output pattern.
    Please revise the extracted summary based on the document. The revised summary should include the information in the extracted summary. Original Document: {document}<>[/INST]<\s>.
    """
    for i in range(iter_num):
        response_list = []
        for idx in tqdm(range(len(df))):
            doc = df.iloc[idx, 0]
            prompt = PromptTemplate(template=template, input_variables=["document"])
            llm_chain = LLMChain(prompt=prompt, llm=hf)
            response = llm_chain.invoke(input = doc)['text']
            if len(response) > 0:
                response_list.append([response, df.iloc[idx, 1]])
        df = pd.DataFrame(response_list, columns = ['generate', 'abstract'])
        df.to_csv(os.path.join(OUT_DIR, f"{save_name}_{i}.csv"), index = False)

In [None]:
save_name = 'mistral'
test = pubmed.sample(n=1000)
get_summarization(test, save_name, 3)

100%|██████████| 1000/1000 [1:27:22<00:00,  5.24s/it]
100%|██████████| 1000/1000 [1:11:29<00:00,  4.29s/it]
100%|██████████| 1000/1000 [1:11:09<00:00,  4.27s/it]


In [5]:
save_name = 'mistral'
model_avg_rouge = score.get_rouge_list_from_all_df(save_name)
model_avg_rouge

Get Rouge List From all Dataframe: 100%|██████████| 3/3 [00:22<00:00,  7.57s/it]


array([0.22415016, 0.19500024, 0.20017621])

In [7]:
score.save_rouge_avg(model_avg_rouge,'mistral')

In [8]:
score.statistic_from_rouge_list('mistral_result.npy')

Rouge List:  [0.22415016 0.19500024 0.20017621]
Mean :0.206
Standard Deviation:0.013


(0.206, 0.013)