# This notebook contains the code for inference



In [1]:
!pip install transformers
!pip install tokenizer
!pip install datasets
!pip install rouge_score
!pip install sentencepiece
!pip install rouge

Collecting tokenizer
  Downloading tokenizer-3.4.3-py2.py3-none-any.whl (112 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.3/112.3 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizer
Successfully installed tokenizer-3.4.3
Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datase

In [2]:
from google.colab import drive
import pandas as pd
import textwrap
import json
import os
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader
import logging
transformers_logger = logging.getLogger("transformers")
logging.getLogger("transformers").setLevel(logging.ERROR)
from transformers import BigBirdPegasusForConditionalGeneration, AutoTokenizer
from datasets import load_dataset, load_metric
import torch
from rouge import Rouge


# Mount Google Drive at /content/drive so files stored there can be opened by path
drive.mount('/content/drive')


Mounted at /content/drive


**This section loads the JSON file produced by PDF extraction and initializes the model**

In [4]:
# Folder on the mounted Google Drive that holds the JSON file produced by PDF extraction
dataset_path = "/content/drive/My Drive/Colab Notebooks/CS5242 Project/"

def load_data(file_path):
    """Read a JSON document from disk and return the parsed object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (dict, list, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8: without it, open() falls back to the
    # platform's locale encoding, which can garble non-ASCII text.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Full path of the extracted-PDF JSON inside the dataset folder
data_file_path = os.path.join(dataset_path, "testimage16.json")

# Parse the extracted sections into memory
pdf_data = load_data(data_file_path)

# Use the GPU when CUDA is available; otherwise fall back to the CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#Define the Model
class SummarizationModel:
    """Bundle a pretrained BigBird-Pegasus model with its tokenizer and config.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.
        device: Device the model is moved to after loading.

    Attributes:
        model_name: The identifier the components were loaded from.
        device: The device the model was moved to.
        tokenizer: Result of ``AutoTokenizer.from_pretrained(model_name)``.
        model: The loaded ``BigBirdPegasusForConditionalGeneration`` on ``device``.
        config: The ``config`` attribute of ``model``.
    """

    def __init__(self, model_name, device):
        self.model_name = model_name
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Honour the caller-supplied device rather than the module-level DEVICE,
        # so the parameter actually controls placement.
        self.model = BigBirdPegasusForConditionalGeneration.from_pretrained(model_name).to(device)
        # Reuse the config of the model that is already loaded; loading the
        # checkpoint a second time only to read its config is wasted work.
        self.config = self.model.config

# Build the summarizer once and expose its model and tokenizer as notebook-level names

model_name = "google/bigbird-pegasus-large-arxiv"
model_summarizer = SummarizationModel(model_name=model_name, device=DEVICE)
model, tokenizer = model_summarizer.model, model_summarizer.tokenizer
# print(model_summarizer.config)

#

**Model Inference**

In [6]:
#Generate Summary for the content using the loaded model
def generate_summary(content, max_length=300, num_beams=4, max_input_length=1024):
    """Generate a summary of ``content`` with the notebook-level model and tokenizer.

    Args:
        content: Text to summarize.
        max_length: ``max_length`` passed to ``model.generate``.
        num_beams: Number of beams passed to ``model.generate``.
        max_input_length: ``max_length`` passed to the tokenizer; longer input
            is truncated to this many tokens before generation.

    Returns:
        The first generated sequence decoded with special tokens skipped, or an
        empty string when ``content`` is an empty or whitespace-only string.
    """
    # There is nothing to summarize in blank text; skip the model call rather
    # than asking it to generate from an input with no content.
    if isinstance(content, str) and not content.strip():
        return ""

    inputs = tokenizer(content, return_tensors="pt", max_length=max_input_length, truncation=True)
    summary_ids = model.generate(
        inputs.input_ids.to(DEVICE),
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
    )
    # Only one input sequence is passed in, so the first output row is the summary.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Parse each section and its subsections to generate summaries from the model
def process_section(section, results):
    """Summarize one section, record and print the result, then visit its subsections.

    Args:
        section: Mapping with a "Text" entry (the content to summarize), a
            "Section" entry (its name) and, optionally, a "Subsections" entry
            holding an iterable of mappings of the same form.
        results: List that receives one dict per visited section, with the keys
            "Section Name" and "Generated Summary". It is modified in place.

    Returns:
        None. Output is delivered through ``results`` and stdout.
    """
    # "Text" is read before "Section" so a malformed entry fails on the same key as before.
    body_text = section["Text"]
    heading = section["Section"]
    generated = generate_summary(body_text)

    results.append({"Section Name": heading, "Generated Summary": generated})

    print("Section Name: ", heading)
    print("Generated Summary: ", textwrap.fill(str(generated), width=80))

    # The parent is recorded first, then each child in order (depth-first).
    if "Subsections" not in section:
        return
    for child in section["Subsections"]:
        process_section(child, results)


# Summarize the section contents and subsection contents
def summarize_pdf(pdf_data, output_file):
    """Summarize every top-level section of ``pdf_data`` and save the results as JSON.

    Args:
        pdf_data: Iterable of section mappings; each is handed to ``process_section``.
        output_file: Path of the JSON file to write (overwritten if it exists).

    Returns:
        None. The collected summaries are written to ``output_file``.
    """
    collected = []
    for top_level_section in pdf_data:
        # process_section appends to `collected` for the section and its subsections.
        process_section(top_level_section, collected)

    # The file is written only after every section has been processed.
    with open(output_file, "w") as handle:
        json.dump(collected, handle, indent=4)

# Summarize every section of the loaded PDF data and write the results to a JSON
# file; the relative path below resolves against the kernel's working directory.
output_file = "summary_results_googlebigbird.json"
summarize_pdf(pdf_data, output_file)


Section Name:  1 Introduction
Generated Summary:  we present a vision transformer ( vit ) for large scale image recognition.<n> we
split an image into patches and provide the sequence of linear embeddings of
these patches as an input to a trans- former.<n> image patches are treated the
same way as tokens ( words ) in an nlp application.<n> we train the model on
image classication in supervised fashion.<n> when trained on mid sized datasets
such as imagenet without strong regularization, these mod- els yield modest
accuracies of a few percentage points below resnets of comparable size.<n> this
seemingly discouraging outcome may be expected : transformers lack some of the
inductive biases fine tuning code and pre trained models are available at
https://github.com/ google research/vision_transformer arxiv:v jun published as
a conference paper at iclr inherent to cnns, such as translation equivariance
and locality, and therefore do not generalize well when trained on insufcient
amounts of 