# Libraries

In [None]:
import requests
import PyPDF2
import io
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
from tqdm import tqdm
import re
from concurrent.futures import ThreadPoolExecutor
import numpy as np

# PDF2Text | _Understanding the Formation of Galaxies with Warm Dark Matter_


In [None]:
# Download the arXiv PDF of the paper; later cells reuse `response.content`.
url = "https://arxiv.org/pdf/2310.06882.pdf"
# A timeout keeps the notebook from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)
# Fail here on a 4xx/5xx reply instead of saving and parsing an error page
# as if it were the PDF.
response.raise_for_status()

In [None]:
# Persist the downloaded bytes unchanged so the PDF can be inspected offline.
with open('../data/v2/paper_raw.pdf', 'wb') as pdf_file:
    pdf_file.write(response.content)

In [None]:
# Parse the downloaded bytes from memory and concatenate the extracted text of
# every page, in page order, with no separator between pages.
pdf_reader = PyPDF2.PdfReader(io.BytesIO(response.content))
text = ''.join(page.extract_text() for page in pdf_reader.pages)

In [None]:
# Save the extracted plain text as UTF-8.
with open("../data/v2/paper.txt", "w", encoding="utf-8") as text_file:
    text_file.write(text)

# Summarization | _Facebook BART Large CNN_


**Load Model**

In [None]:
# Build a transformers "summarization" pipeline using the
# "facebook/bart-large-cnn" checkpoint; process_batch calls it once per chunk.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

**Pre-Processing Data**

In [None]:
def clean_text(text: str) -> str:
    """Reduce *text* to ASCII letters separated by single spaces.

    Every character that is neither an ASCII letter nor whitespace (digits,
    punctuation, symbols, accented letters) is deleted, each run of
    whitespace is collapsed to one space, and leading/trailing whitespace is
    stripped.

    Args:
        text: The string to normalise.

    Returns:
        The cleaned string; empty if *text* contains no ASCII letters.
    """
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # `\s` matches Unicode whitespace, non-breaking spaces included, so this
    # substitution already turns every NBSP into a plain space. The former
    # trailing replace() of NBSP could therefore never match and was dropped.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [None]:
def process_batch(batch):
    """Summarise one chunk of text and return the cleaned summary.

    The chunk is passed to the module-level ``summarizer`` with
    ``max_length=150``, ``min_length=10`` and sampling disabled; the
    ``summary_text`` of the first result is then run through ``clean_text``.
    """
    results = summarizer(batch, max_length=150, min_length=10, do_sample=False)
    first_summary = results[0]["summary_text"]
    return clean_text(first_summary)

In [None]:
# Width, in characters, of each slice of the paper handed to the summarizer.
CHUNK_SIZE = 1024
# Cut the text into consecutive fixed-width slices; the last one may be
# shorter. Cut positions ignore word and sentence boundaries.
text_in_batch = [text[start:start + CHUNK_SIZE] for start in range(0, len(text), CHUNK_SIZE)]

In [None]:
# Split the list of chunks into 4 groups of near-equal size.
# NOTE(review): `batches` is not read anywhere in the visible code; the
# executor cell maps over `text_in_batch` directly. Confirm before removing.
batches = np.array_split(text_in_batch, 4)

**Generating Summary With Multithreading**

In [None]:
# Start from an empty list (the type this name ends up holding) so a failed
# run never leaves a stale or wrongly-typed value behind.
summary = []
# Summarise every chunk on a pool of 4 worker threads. executor.map yields
# results in input order, so `summary` lines up with `text_in_batch`.
# `total` is passed explicitly because the iterator returned by executor.map
# has no length, and tqdm cannot show a percentage or ETA without one.
with ThreadPoolExecutor(max_workers=4) as executor:
    summary = list(tqdm(executor.map(process_batch, text_in_batch), total=len(text_in_batch)))

# Save the summary


In [None]:
# Write the per-chunk summaries as UTF-8, one summary per line.
with open("../data/v2/summary.txt", "w", encoding="utf-8") as out_file:
    out_file.writelines(line + "\n" for line in summary)