**Install the libraries below**



In [None]:
!pip install transformers
!pip install tokenizer
!pip install datasets
!pip install rouge_score
!pip install sentencepiece
!pip install rouge



In [None]:
from google.colab import drive
import pandas as pd
import textwrap
import json
import os
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader
import logging
# Only let ERROR and above through from the "transformers" logger.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)
# Process-wide cutoff: suppress WARNING and everything below it (INFO, DEBUG)
# for every logger. A preceding logging.disable(logging.INFO) call was removed
# because this call superseded it immediately.
logging.disable(logging.WARNING)
from transformers import LEDForConditionalGeneration, LEDTokenizer
from datasets import load_dataset, load_metric
import torch
from rouge import Rouge


# Mount Google Drive at /content/drive (Colab only). The input JSON and the
# model checkpoint used further down are both read from paths under this mount.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**This section loads the JSON file produced by the PDF extraction step and defines the summarization model**

**Please copy the json file after pdf extraction to the Colab Notebooks folder**

**Change the file name assigned to the variable data_file_path so that it matches your JSON file**

In [None]:
# Load the JSON file produced by PDF extraction.
# Folder on the mounted Google Drive that holds that JSON file.
# NOTE(review): this path spells the Drive root "My Drive", while the checkpoint
# path in the inference cell uses "MyDrive" — confirm both resolve in your runtime.
dataset_path = "/content/drive/My Drive/Colab Notebooks/"

def load_data(file_path):
    """Read a JSON document from disk and return the parsed Python object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The value produced by ``json.load`` for the file's top-level element.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Decode explicitly as UTF-8: the platform default encoding is not
    # guaranteed to be UTF-8 and would garble non-ASCII text in the document.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Full path of the extracted-PDF JSON inside the Drive folder
data_file_path = os.path.join(dataset_path, "PointNet.json")

# Parse the JSON file into Python objects
pdf_data = load_data(file_path=data_file_path)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

#Define the Model
class SummarizationModel:
    """Bundle an LED tokenizer and model and summarize sectioned documents.

    A document is a list of section dicts. Each section carries a "Section"
    name, a "Text" body and, optionally, a "Subsections" list of dicts with
    the same layout.
    """

    # Class-level default device. Kept so existing references to
    # ``SummarizationModel.DEVICE`` keep working; instances use ``self.device``.
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def __init__(self, model_name, device=None):
        """Load tokenizer and model for ``model_name`` and move the model to ``device``.

        Args:
            model_name: Name or path handed to ``from_pretrained``.
            device: Device (``torch.device`` or device string) to run on.
                Falls back to the class-level ``DEVICE`` when ``None``.
        """
        self.model_name = model_name
        # Honor the caller's device instead of silently using a module global.
        self.device = self.DEVICE if device is None else torch.device(device)
        self.tokenizer = LEDTokenizer.from_pretrained(model_name)
        self.model = LEDForConditionalGeneration.from_pretrained(model_name).to(self.device)
        # Reuse the loaded model's config rather than loading the checkpoint twice.
        self.config = self.model.config

    #Function to summarize each section of the pdf
    def generate_summary(self, content, model=None):
        """Generate a summary for one piece of text.

        Args:
            content: Text to summarize; it is truncated to 1024 tokens.
            model: Object exposing ``generate``; defaults to ``self.model``.

        Returns:
            The list returned by ``tokenizer.batch_decode`` for the generated
            sequences (one sequence is requested per input).
        """
        if model is None:
            model = self.model

        inputs = self.tokenizer(content, return_tensors="pt", max_length=1024, truncation=True)
        input_ids = inputs["input_ids"].to(self.device)
        attention_mask = inputs["attention_mask"].to(self.device)
        # Global attention on the first token only; every other position stays 0.
        global_attention_mask = torch.zeros_like(input_ids)
        global_attention_mask[:, 0] = 1
        summary_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            global_attention_mask=global_attention_mask,
            # NOTE(review): 1042 may be a typo for 1024 — value kept as-is, confirm.
            max_length=1042,
            min_length=100,
            num_beams=4,
            no_repeat_ngram_size=3,
            early_stopping=True,
            num_return_sequences=1,
        )
        return self.tokenizer.batch_decode(summary_ids, skip_special_tokens=True)

    #Parse each sections and subsection to generate summaries from the model
    def process_section(self, section, results, model=None):
        """Summarize ``section`` and, depth-first, each of its subsections.

        Args:
            section: Dict with "Text" and "Section" keys and an optional
                "Subsections" list of dicts with the same layout.
            results: List that receives one
                ``{"Section Name": ..., "Generated Summary": ...}`` dict per
                processed section; it is appended to in place.
            model: Object exposing ``generate``; defaults to ``self.model``.

        Raises:
            KeyError: If ``section`` lacks the "Text" or "Section" key.
        """
        content = section["Text"]
        section_name = section["Section"]
        summary_text = self.generate_summary(content, model)
        results.append({
            "Section Name": section_name,
            "Generated Summary": summary_text,
        })
        wrapped_output = textwrap.fill(str(summary_text), width=80)
        print("Section Name: ", section_name)
        print("Generated Summary: ", wrapped_output)
        # Process the subsections if they exist. Recurse through ``self`` so the
        # method does not depend on a module-level instance with a fixed name.
        if "Subsections" in section:
            for subsection in section["Subsections"]:
                self.process_section(subsection, results, model)

    # Summarize the section contents and subsection contents
    def summarize_pdf(self, pdf_data, output_file, model=None):
        """Summarize every section of ``pdf_data`` and write the results as JSON.

        Args:
            pdf_data: Iterable of section dicts (see ``process_section``).
            output_file: Path of the JSON file to write; it is overwritten.
            model: Object exposing ``generate``; defaults to ``self.model``.
        """
        all_results = []
        for section in pdf_data:
            self.process_section(section, all_results, model)
        with open(output_file, "w") as json_file:
            json.dump(all_results, json_file, indent=4)

# Build the summarizer from the LED checkpoint named below

model_name = "allenai/led-large-16384-arxiv"
model_summarizer = SummarizationModel(model_name=model_name, device=DEVICE)
# Module-level handles to the wrapped tokenizer and network
tokenizer, model = model_summarizer.tokenizer, model_summarizer.model


**Model Inference using the checkpoint**

In [None]:
# Restore the checkpoint weights from Drive, then summarize the loaded PDF sections
model_save_name = 'model_checkpoint.pt'
path = f"/content/drive/MyDrive/Colab Notebooks/Checkpoints/{model_save_name}"

# map_location remaps tensors that were saved on a GPU onto the current DEVICE,
# so the checkpoint also loads when the runtime has fallen back to the CPU.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
model.load_state_dict(torch.load(path, map_location=DEVICE))
output_file = "summary_results_models-PointNet.json"
model_summarizer.summarize_pdf(pdf_data, output_file,model)


Section Name:  Abstract
Generated Summary:  ["The text introduces a hierarchical neural network called pointnet++ for deep
hierarchical feature learning on point sets in a metric space. The network is
able to learn deep point set features efficiently and robustly. It utilizes a
nested partitioning of the input point set to learn local features with
increasing contextual scales. by exploiting metric space distances, point sets
are sampled with varying densities, which results in greatly decreased
performance for networks trained on uniform densities. The study of point sets
is challenging due to limitations in capturing local structures induced by
metric space points, limiting the network's ability to recognize fine-grained
patterns and generalizability to complex scenes. with further observation that
point sets typically sampled with different densities results in reduced
performance compared to networks trained with uniform density, novel set
learning layers are proposed to adaptively