\# This notebook has steps for model initialisation and Training**



In [33]:
!pip install transformers
!pip install tokenizer
!pip install datasets
!pip install rouge_score
!pip install sentencepiece
!pip install rouge



In [34]:
from google.colab import drive
import pandas as pd
import textwrap
import json
import os
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader
import logging
transformers_logger = logging.getLogger("transformers")
logging.getLogger("transformers").setLevel(logging.ERROR)
from transformers import LEDForConditionalGeneration, LEDTokenizer
from datasets import load_dataset, load_metric
import torch
from rouge import Rouge


# Mount Google Drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**This section contains the code for loading the json file after pdf extraction and the model initialization**

In [35]:
#Load the json file after pdf extraction
dataset_path = "/content/drive/My Drive/Colab Notebooks/CS5242 Project/"

def load_data(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

# Join the paths
data_file_path = os.path.join(dataset_path, 'testimage16.json')

# Load  data
pdf_data = load_data(data_file_path)

# Device configuration
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#Define the Model
class SummarizationModel:
    def __init__(self, model_name, device):
        self.model_name = model_name
        self.tokenizer = LEDTokenizer.from_pretrained(model_name)
        self.model = LEDForConditionalGeneration.from_pretrained(model_name).to(DEVICE)
        self.config=LEDForConditionalGeneration.from_pretrained(model_name).config

#Instantiate the model

model_name = "allenai/led-large-16384-arxiv"
model_summarizer = SummarizationModel(model_name, device=DEVICE)
model = model_summarizer.model
tokenizer=model_summarizer.tokenizer
#print(modelsummarizer.config)

#

**Model Inference**

In [36]:
#Generate Summary for the content using the loaded model
def generate_summary(content):
        max_length=300
        num_beams=4
        inputs = tokenizer(content, return_tensors="pt", max_length=1024, truncation=True)
        summary_ids = model.generate(inputs.input_ids.to(DEVICE), max_length=max_length, num_beams=num_beams, early_stopping=True)
        summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary_text

#Pase each sections and subsection to generate summaries from the model
def process_section(section,results):

    # Process the content of the each section
    section_summary_results = {}
    content = section["Text"]
    section_name=section["Section"]
    summary_text = generate_summary(content)
    section_summary_results["Section Name"] = section_name
    section_summary_results["Generated Summary"] = summary_text
    results.append(section_summary_results)
    print("Section Name: ", section_name)
    wrapped_output = textwrap.fill(str(summary_text), width=80)
    print("Generated Summary: ", wrapped_output)
        # Process the subsections if they exist
    if "Subsections" in section:
        for subsection in section["Subsections"]:
            process_section(subsection,results)


# Summarize the section contents and subsection contents
def summarize_pdf(pdf_data, output_file):
    all_results = []
    for section in pdf_data:
        process_section(section,all_results)
    with open(output_file, "w") as json_file:
        json.dump(all_results, json_file, indent=4)

#Write the final summary to the summary jsonfile
output_file = "summary_results_allenai.json"
summarize_pdf(pdf_data, output_file)


Section Name:  1 Introduction
Generated Summary:   self attention based architectures, in particular transformers, have become the
model of choice in natural language processing (nlp ). inspired by the
transformer scaling successes in nlp, we experiment with applying a standard
transformer directly to images, with the fewest possible modications. to do so,
we split an image into patches and provide the sequence of linear embeddings of
these patches as an input to a trans- former. image patches are treated the same
way as tokens (words ) in an nlp application.   we train the model on image
classication in supervised fashion. when trained on mid sized datasets such as
imagenet without strong regularization, these mod- els yield modest accuracies
of a few percentage points below resnets of comparable size.   however, the
picture changes if the models are trained on larger datasets ( m m images ).
we nd that large scale training trumps inductive bias.
Section Name:  2 Related Work
Generate