# Libraries


In [1]:
import io
from pathlib import Path

import nltk
import PyPDF2
import requests
from tqdm import tqdm
from transformers import BigBirdPegasusForConditionalGeneration, AutoTokenizer

In [2]:
# Fetch the Punkt sentence-tokenizer data used by nltk.sent_tokenize.
# Recent NLTK releases load the "punkt_tab" resource instead of the pickled
# "punkt" models, so request both. NOTE(review): on older releases that do not
# know "punkt_tab", nltk.download is expected to report the unknown id and
# return False rather than raise — confirm against the pinned NLTK version.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /home/elesdes/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# PDF2Text | _Understanding the Formation of Galaxies with Warm Dark Matter_


In [3]:
url = "https://arxiv.org/pdf/2310.06882.pdf"
# Bound the wait and fail fast on an HTTP error so that an error page is never
# handed to the PDF parser.
response = requests.get(url, timeout=60)
response.raise_for_status()
pdf_reader = PyPDF2.PdfReader(io.BytesIO(response.content))
# Join pages with a newline so the last word of one page is not fused with the
# first word of the next; `or ""` guards against a page yielding no text.
text = "\n".join((page.extract_text() or "") for page in pdf_reader.pages)

# Pre-Processing Data


In [4]:
def split_text_into_batches(text, batch_size=4096):
    """Group the sentences of ``text`` into batches of at most ``batch_size`` characters.

    The text is split with ``nltk.sent_tokenize`` and consecutive sentences are
    joined with a single space. The size budget is measured in characters of
    the joined batch, so the separating spaces are counted.

    Args:
        text: The full document text.
        batch_size: Maximum length, in characters, of each joined batch.

    Returns:
        A list of batch strings in document order. A single sentence longer
        than ``batch_size`` is kept whole in a batch of its own (it is never
        split), and no empty batch is ever emitted.
    """
    sentences = nltk.sent_tokenize(text)

    batches = []
    current_batch = []
    current_batch_size = 0

    for sentence in tqdm(sentences):
        # A non-empty batch needs one extra character for the joining space.
        added = len(sentence) + (1 if current_batch else 0)

        # Only flush when there is something to flush; otherwise an oversized
        # first sentence would produce an empty "" batch.
        if current_batch and current_batch_size + added > batch_size:
            batches.append(" ".join(current_batch))
            current_batch = [sentence]
            current_batch_size = len(sentence)
        else:
            current_batch.append(sentence)
            current_batch_size += added

    if current_batch:
        batches.append(" ".join(current_batch))

    return batches

In [5]:
def process_batches(batches, tokenizer, model, max_length=4096):
    """Generate a summary for every text batch.

    Each batch is tokenized on its own, passed to ``model.generate`` and the
    generated ids are decoded back to text with special tokens skipped.

    Args:
        batches: Iterable of text batches to summarize.
        tokenizer: Callable tokenizer that also provides ``batch_decode``.
        model: Object providing ``generate(**inputs)``.
        max_length: Maximum number of input tokens per batch. Longer inputs
            are truncated instead of being passed to the model oversized.
            Defaults to 4096, the maximum length noted for this model.

    Returns:
        A flat list with the decoded strings of all batches, in batch order.
    """
    predictions = []

    for batch in tqdm(batches):
        # Enforce the input limit explicitly; without truncation an unusually
        # long batch would reach the model beyond its maximum length.
        inputs = tokenizer(
            batch, return_tensors="pt", truncation=True, max_length=max_length
        )
        generated = model.generate(**inputs)
        predictions.extend(
            tokenizer.batch_decode(generated, skip_special_tokens=True)
        )

    return predictions

# Summarization | _BigBird Pegasus Large - Arxiv Dataset Variation_


In [48]:
# Checkpoint identifier shared by the tokenizer and the model.
model_name = "google/bigbird-pegasus-large-arxiv"

# Tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Summarization model loaded with full attention.
# NOTE(review): block_size / num_random_blocks look like block-sparse attention
# settings — confirm they have any effect with attention_type="original_full".
model = BigBirdPegasusForConditionalGeneration.from_pretrained(
    model_name,
    attention_type="original_full",
    block_size=32,
    num_random_blocks=4,
)

In [49]:
# Split the extracted paper text into sentence-aligned batches.
batches = split_text_into_batches(text)
# Summarize every batch with the loaded model; this is the slow step
# (the recorded run below took several minutes for 8 batches).
predictions = process_batches(batches, tokenizer, model)

100%|██████████| 229/229 [00:00<00:00, 1398101.33it/s]
100%|██████████| 8/8 [06:31<00:00, 48.94s/it]


# Clean Summary | _Post-Processing_


In [62]:
import re


def clean_text(input_text):
    """Tidy up generated text for saving.

    Removes every literal ``<s>`` marker, collapses each run of whitespace
    (newlines included) into a single space while trimming both ends, and
    capitalizes the first word character of the text as well as any word
    character that directly follows a period and a space.

    Args:
        input_text: The raw text to clean.

    Returns:
        The cleaned text as a single line.
    """

    def _capitalize(match):
        return match.group(0).capitalize()

    without_markers = re.sub(r"<s>", "", input_text)
    single_spaced = " ".join(without_markers.split())
    return re.sub(r"(?<=\. )(\w)|^\w", _capitalize, single_spaced)

# Save the summary


In [63]:
# Keep only the text before the first "<n>" marker of each prediction, stack
# the pieces line by line and run the result through clean_text.
summary = clean_text(
    "\n".join(prediction.partition("<n>")[0] for prediction in predictions)
)

In [64]:
# Write the cleaned summary to disk as UTF-8.
summary_path = Path("../data/v4/summary.txt")
# Create the output directory first so a fresh checkout does not fail with
# FileNotFoundError when ../data/v4 is missing.
summary_path.parent.mkdir(parents=True, exist_ok=True)
with summary_path.open("w", encoding="utf-8") as file:
    file.write(summary)