In [1]:
import sys
import json
import glob
import tqdm
import pandas as pd
import torch
import evaluate
from transformers import pipeline
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
# Register tqdm's progress-bar variants (e.g. `progress_apply`) on pandas objects.
tqdm.tqdm.pandas()
# Make the project-local `summarizer` module importable.
# NOTE(review): absolute, user-specific path — this only resolves on the original machine layout.
sys.path.append('/home/verma.shi/LLM/LitArt/models')
from summarizer import TextSummaryModel
# NOTE(review): no visible code reads this global; load_model_details takes
# `cache_dir` from run_config.json instead.
cache_dir="/work/LitArt/cache"

In [2]:
def load_model_details(path, state_dict_prefix="model."):
    """Load the fine-tuned and base summarization models of one training run.

    Reads ``run_config.json`` from the run directory, loads the pretrained base
    model and tokenizer named there, and then builds a second instance of the
    base architecture whose weights come from a checkpoint stored under
    ``my_model/version_0/checkpoints/``.

    Args:
        path: Run directory containing ``run_config.json`` and the
            ``my_model/version_0/checkpoints/`` folder. A trailing slash is
            optional.
        state_dict_prefix: Prefix in front of the weight names inside the
            checkpoint. It is removed so the names match the bare seq2seq
            model. Defaults to ``"model."`` (six characters, the length the
            previous hard-coded slice removed).

    Returns:
        Tuple ``(summary_model, base_model, tokenizer, run_details)``.
        ``base_model`` is moved to CUDA when available; ``summary_model`` is
        returned as created by ``from_pretrained``. ``run_details`` is the
        parsed run config plus a ``"best_model_path"`` entry naming the
        checkpoint file that was loaded.

    Raises:
        FileNotFoundError: If ``run_config.json`` or any ``*.ckpt`` file is missing.
        ValueError: If no weight name in the checkpoint starts with
            ``state_dict_prefix`` (the checkpoint would otherwise be silently
            ignored when loading).
    """
    import os  # local import: keeps the cell runnable without touching the import cell

    with open(os.path.join(path, "run_config.json")) as json_file:
        run_details = json.load(json_file)

    base_model_name = run_details["base_model_name"]
    tokenizer_name = run_details["tokenizer_name"]
    cache_dir = run_details["cache_dir"]

    device = "cuda" if torch.cuda.is_available() else "cpu"

    base_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_name,cache_dir=cache_dir).to(device)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name,cache_dir=cache_dir)

    checkpoint_pattern = os.path.join(path, "my_model", "version_0", "checkpoints", "*.ckpt")
    # glob returns matches in arbitrary order; sort so the choice is deterministic.
    checkpoint_files = sorted(glob.glob(checkpoint_pattern))
    if not checkpoint_files:
        raise FileNotFoundError(f"No checkpoint found matching {checkpoint_pattern}")
    # NOTE: this is the first match by name, not a selection by validation loss.
    best_checkpoint_location = checkpoint_files[0]

    # SECURITY: torch.load unpickles the file — only load checkpoints from trusted sources.
    checkpoint = torch.load(f=best_checkpoint_location,map_location=device)

    # Build a fresh mapping instead of mutating the checkpoint dict in place,
    # and only strip the prefix from keys that actually carry it.
    stripped_state_dict = {}
    stripped_count = 0
    for key, weight in checkpoint["state_dict"].items():
        if key.startswith(state_dict_prefix):
            key = key[len(state_dict_prefix):]
            stripped_count += 1
        stripped_state_dict[key] = weight
    if stripped_count == 0:
        raise ValueError(
            f"No key in the checkpoint state_dict starts with {state_dict_prefix!r}; "
            "pass the correct state_dict_prefix."
        )

    # Pass cache_dir here as well so the base weights are not fetched a second
    # time into the default cache location.
    summary_model = AutoModelForSeq2SeqLM.from_pretrained(
        pretrained_model_name_or_path=base_model_name,
        state_dict=stripped_state_dict,
        cache_dir=cache_dir,
    )

    run_details["best_model_path"] = best_checkpoint_location

    return summary_model,base_model,tokenizer,run_details
    

In [3]:
# Resolve the inference device, then load both models, the tokenizer and the
# run configuration for the chosen training run.
device = "cuda" if torch.cuda.is_available() else "cpu"
checkpoints_path = "/work/LitArt/verma/google-pegasus-xsum-2024-03-15-19:32:22/"
(
    summary_model,
    base_model,
    tokenizer,
    run_details,
) = load_model_details(checkpoints_path)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Inspect the run configuration, including the "best_model_path" entry added by load_model_details.
run_details

{'train_path': '/work/LitArt/data/chunked_dataset/train_dataset_with_summaries.csv',
 'test_path': '/work/LitArt/data/chunked_dataset/test_dataset_with_summaries.csv',
 'val_path': '/work/LitArt/data/chunked_dataset/validation_dataset_with_summaries.csv',
 'base_model_name': 'google/pegasus-xsum',
 'tokenizer_name': 'google/pegasus-xsum',
 'cache_dir': '/work/LitArt/cache',
 'batch_size': 32,
 'tokenizer_chapter_max_length': 512,
 'tokenizer_summary_max_length': 64,
 'epochs': 10,
 'log_path': '/work/LitArt/verma/',
 'best_model_path': '/work/LitArt/verma/google-pegasus-xsum-2024-03-15-19:32:22/my_model/version_0/checkpoints/epoch=9-val_loss=1.37.ckpt'}

In [5]:
def summarize(text,model,tokenizer,chapter_length,summary_length,temperature=1,repetition_penalty=1,device='cpu',do_sample=True):
    """Generate a summary of ``text`` with a seq2seq model.

    The text is prefixed with a fixed instruction, tokenized (truncated and
    padded to ``chapter_length`` tokens) and passed to ``model.generate``.

    Args:
        text: Passage to summarize (a string).
        model: Model exposing ``.to(device)`` and ``.generate(...)``.
        tokenizer: Tokenizer matching ``model``.
        chapter_length: Maximum number of input tokens; longer inputs are truncated.
        summary_length: ``max_length`` passed to ``generate``.
        temperature: Sampling temperature; only forwarded when ``do_sample`` is true.
        repetition_penalty: Repetition penalty passed to ``generate``.
        device: Device the model and inputs are moved to.
        do_sample: Sample from the output distribution (default, the previous
            hard-coded behavior). Pass ``False`` for deterministic decoding.

    Returns:
        The decoded sequences returned by ``generate``, joined by single spaces.
    """
    model = model.to(device)
    prompt = "Summarize the following : \n" + text
    inputs = tokenizer(prompt,
                       max_length=chapter_length,
                       truncation=True,
                       padding="max_length",
                       add_special_tokens=True,
                       return_tensors="pt").to(device)

    generation_kwargs = {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "max_length": summary_length,
        "do_sample": do_sample,
        "repetition_penalty": repetition_penalty,
    }
    if do_sample:
        # Temperature only affects sampling; leaving it out otherwise avoids
        # passing a generation flag that would not be used.
        generation_kwargs["temperature"] = temperature

    # The generated ids need no extra device move: decoding accepts them as returned.
    summarized_ids = model.generate(**generation_kwargs)

    return " ".join(tokenizer.batch_decode(summarized_ids, skip_special_tokens=True))

In [6]:
# Evaluate on a fixed random subset of 100 test rows (seeded for repeatability).
test_df = (
    pd.read_csv(run_details["test_path"])
    .sample(n=100, random_state=42)
)

In [12]:
# Preview of the evaluation sample.
# NOTE(review): the saved output already lists the model_summary / base_model_summary
# columns, which are only created by cells further down — this cell was executed out of order.
test_df

Unnamed: 0,chapter,human_summary,__index_level_0__,summary_text,model_summary,base_model_summary
135,says my idealistic friend what vulgar details ...,book second chapter in which the story pauses ...,613,The passage emphasizes the importance of human...,a friend reflects on the human condition highl...,This is an extract from a poem by the poet Sir...
1281,home with my faith shaken and i have been gett...,the karamazovs go to the meeting with father z...,11763,Faith is shaken due to Pyotr Alexandrovitch's ...,fyodor pavlovitch angrily confronts miuesov ac...,The following is an extract from the book of G...
1117,chapter x crown and tiara aramis was the first...,aramis is filled with suspense as he watches t...,8678,"Aramis helps a prisoner, Philippe, experience ...",aramis walks with the young man in a carriage ...,This is the story of how aramis the prisoner c...
998,their very name is a frightful one for the wor...,not many days after the ship has landed at nuk...,6179,"Typees in Marquesan dialect means cannibals, f...",the typees of nukuheva are known as cannibals ...,My dear readers i am writing to you from our s...
808,is a danger of that chum frink chanted oh say ...,business is brisk that spring so babbitt and m...,1732,"Men discuss beer recipes, prohibition, and sma...",men discuss home-made beer cider and prohibiti...,A few days ago a group of men and women gather...
...,...,...,...,...,...,...
940,re profitable for there s more workmanship nor...,adam returns from work stops in to see his mot...,468,Mr. Poyser and Adam plan to start a business. ...,adam and mrs poyser discuss making a movable k...,mrs poyser sat down beside adam at the dining ...
422,all been of new indulgence more exactly to her...,rosamond has a miscarriage because she goes ou...,3532,"Lydgate tries to spare Rosamond's feelings, bu...",rosamond and lydgate discuss their financial s...,There was a moment when it seemed as if lydgat...
1221,too readily imagine outcast as i am and reject...,evelina continues to write to mr villars of he...,9180,Evelina seeks protection and comfort from Mr. ...,the narrator pleads for continued protection f...,The following is the full text of a letter whi...
247,nigh an hour after i couldn t help laying down...,the first thing adam notices on entering the c...,532,"Witness found a baby's hand under a bush, assu...",a witness found a baby s hand under a nut-bush...,The following is the testimony of one of the w...


In [7]:
# Load the ROUGE metric through the `evaluate` library; used below to score both models.
rouge = evaluate.load('rouge')

In [8]:
# Generation samples tokens (do_sample=True inside summarize, temperature=1.5), so
# seed torch first; without a seed every run produces different summaries and scores.
torch.manual_seed(42)
test_df["model_summary"] = test_df["chapter"].progress_apply(
    lambda text: summarize(
        text,
        summary_model,
        tokenizer,
        chapter_length=run_details["tokenizer_chapter_max_length"],
        summary_length=run_details["tokenizer_summary_max_length"],
        temperature=1.5,
        repetition_penalty=1.5,
        device=device,
    )
)

100%|██████████| 100/100 [00:50<00:00,  1.97it/s]


In [9]:
# Generation samples tokens (do_sample=True inside summarize, temperature=1.5), so
# seed torch first; without a seed every run produces different summaries and scores.
torch.manual_seed(42)
test_df["base_model_summary"] = test_df["chapter"].progress_apply(
    lambda text: summarize(
        text,
        base_model,
        tokenizer,
        chapter_length=run_details["tokenizer_chapter_max_length"],
        summary_length=run_details["tokenizer_summary_max_length"],
        temperature=1.5,
        repetition_penalty=1.5,
        device=device,
    )
)

100%|██████████| 100/100 [01:07<00:00,  1.48it/s]


In [10]:
# ROUGE of the fine-tuned model's summaries against the `summary_text` references.
references = test_df["summary_text"].tolist()
predictions = test_df["model_summary"].tolist()
results_model = rouge.compute(
    references=references,
    predictions=predictions,
)
results_model

{'rouge1': 0.283749797758862,
 'rouge2': 0.07062861192743758,
 'rougeL': 0.23015921978427872,
 'rougeLsum': 0.22985554099902414}

In [13]:
# ROUGE of the base model's summaries against the same `summary_text` references.
references = test_df["summary_text"].tolist()
predictions = test_df["base_model_summary"].tolist()
results_base = rouge.compute(
    references=references,
    predictions=predictions,
)
results_base

{'rouge1': 0.15576442037471938,
 'rouge2': 0.01963816736803319,
 'rougeL': 0.12315978608369169,
 'rougeLsum': 0.12333303678751445}

In [15]:
def calculate_percentage_difference(dict1, dict2):
    """Print and return the percentage change of every metric from dict1 to dict2.

    Args:
        dict1: Baseline scores keyed by metric name.
        dict2: Scores to compare against the baseline; must contain every
            key of ``dict1``.

    Returns:
        Dict mapping each metric of ``dict1`` to
        ``(dict2[metric] - dict1[metric]) / dict1[metric] * 100``.
        When the baseline is 0 the ratio is undefined: the result is ``0.0``
        if the value did not change, otherwise ``inf`` or ``-inf`` following
        the sign of the change.

    Raises:
        KeyError: If a metric of ``dict1`` is missing from ``dict2``.
    """
    percentage_difference = {}

    for metric, baseline in dict1.items():
        difference = dict2[metric] - baseline
        if baseline == 0:
            # Avoid ZeroDivisionError: report "no change" or an unbounded change.
            if difference == 0:
                percentage_diff = 0.0
            else:
                percentage_diff = float("inf") if difference > 0 else float("-inf")
        else:
            percentage_diff = (difference / baseline) * 100
        percentage_difference[metric] = percentage_diff

    # Print only after every metric was computed, so a missing key prints nothing.
    for metric, percentage_diff in percentage_difference.items():
        print(f"{metric}: {percentage_diff:.2f}%")

    return percentage_difference

In [17]:
# Percentage change of each ROUGE score, going from the base model to the fine-tuned model.
calculate_percentage_difference(results_base, results_model)

rouge1: 82.17%
rouge2: 259.65%
rougeL: 86.88%
rougeLsum: 86.37%


{'rouge1': 82.16598956054966,
 'rouge2': 259.64970968933756,
 'rougeL': 86.87854786291763,
 'rougeLsum': 86.36980567910044}

In [None]:
from pprint import pprint

In [None]:
# Pick one row of the evaluation sample by position.
# NOTE(review): `chapter` and `summary` are reassigned by the cells that follow, so
# when the notebook runs top to bottom these values are not the ones summarized below.
index = 3
chapter = test_df.iloc[index]["chapter"]
summary = test_df.iloc[index]["summary_text"]

In [None]:
# Custom passage used for the qualitative comparison; replaces the dataset row selected above.
chapter = '''In a quaint village nestled between rolling hills and dense forests, where the cobblestone streets whispered tales of old, lived a young girl named Elara. Her hair was as silver as the moonlight that bathed the village in a gentle glow, and her eyes sparkled with the curiosity of a thousand stars. Elara was no ordinary child; she possessed an innate ability to communicate with the natural world around her, a secret she held close to her heart.

Elara's best friend was an ancient oak tree, standing tall and proud at the edge of the forest, its leaves whispering secrets only she could understand. The villagers spoke of the tree in hushed tones, calling it the Guardian of the Glen, a protector of the village's hidden treasures and mysteries.

One crisp autumn evening, as Elara sat beneath the boughs of the Guardian, conversing in the silent language of rustling leaves, a sudden gust of wind carried a plea for help. The forest was in danger; a darkness had begun to creep through the woods, wilting flowers and withering trees with its malevolent touch.

Determined to save her beloved forest, Elara embarked on a journey guided by the whispers of the wind and the guidance of the Guardian. She ventured deeper into the forest than she had ever dared, her path lit by the luminescence of fireflies, dancing around her like a shimmering cloak.

As she delved into the heart of the darkness, Elara discovered its source: a sorcerer, twisted by his own power, seeking to bend the forest's magic to his will. Realizing the peril, Elara drew upon her connection with the natural world, calling forth an assembly of creatures great and small. Owls took flight, their wings casting shadows like silent warriors; foxes darted through the underbrush, their eyes gleaming with determination; even the trees bent their branches, ready to protect their home.

With the forest at her back, Elara confronted the sorcerer, her resolve as strong as the ancient oak itself. The battle was fierce, with the sorcerer's dark magic clashing against the pure, untamed power of nature. In the end, it was the unity of the forest and the purity of Elara's heart that prevailed, cleansing the woods of the darkness that had sought to consume it.

As dawn broke, painting the sky in hues of pink and gold, the forest thrived once more, its magic restored. The villagers, awestruck by the transformation, celebrated Elara as a hero, though she knew the true victory belonged to the forest itself.

Elara's bond with the natural world had saved the village, and in return, she was gifted with a deeper connection to the magic that flowed through the land. She became the Guardian's apprentice, vowing to protect the balance between the village and the forest, a guardian of secrets, a whisperer to the wind.

And so, the legend of Elara, the girl who spoke to the forest, became a tale passed down through generations, a reminder of the harmony that exists when we listen to the whispers of the natural world.'''

In [None]:
# Reference summary for the custom passage; printed further down under the "GPT Summary" label.
summary = "Elara is a young girl who can communicate with nature. She is a protector of the forest and the Guardian of the Glen."

In [None]:
# Summarize the passage with the base model first and the fine-tuned model second,
# using the same generation settings for both.
base_model_summary, model_summary = (
    summarize(
        chapter,
        candidate_model,
        tokenizer,
        chapter_length=run_details["tokenizer_chapter_max_length"],
        summary_length=run_details["tokenizer_summary_max_length"],
        temperature=1.5,
        repetition_penalty=1.5,
        device=device,
    )
    for candidate_model in (base_model, summary_model)
)

In [None]:
# Echo the passage that was just summarized.
chapter

"In a quaint village nestled between rolling hills and dense forests, where the cobblestone streets whispered tales of old, lived a young girl named Elara. Her hair was as silver as the moonlight that bathed the village in a gentle glow, and her eyes sparkled with the curiosity of a thousand stars. Elara was no ordinary child; she possessed an innate ability to communicate with the natural world around her, a secret she held close to her heart.\n\nElara's best friend was an ancient oak tree, standing tall and proud at the edge of the forest, its leaves whispering secrets only she could understand. The villagers spoke of the tree in hushed tones, calling it the Guardian of the Glen, a protector of the village's hidden treasures and mysteries.\n\nOne crisp autumn evening, as Elara sat beneath the boughs of the Guardian, conversing in the silent language of rustling leaves, a sudden gust of wind carried a plea for help. The forest was in danger; a darkness had begun to creep through the w

In [None]:
# Show the three summaries one after another for a qualitative comparison.
print(
    f"Base Model Summary : \n {base_model_summary}",
    f"Fine Tuned Model Summary : \n {model_summary}",
    f"GPT Summary : \n {summary}",
    sep="\n",
)


Base Model Summary : 
 This is the story of Elara, a young girl who embarks on a magical journey to save her beloved forest.
Fine Tuned Model Summary : 
 Elara a young girl in a quaint village communicates with nature, saving her village forest from sorcerer
GPT Summary : 
 Elara is a young girl who can communicate with nature. She is a protector of the forest and the Guardian of the Glen.
